/*
 * intrinsic_mad.c
 *
 * Description of this file:
 *    SSE intrinsic functions of MAD-Calculating module of the xavs2 library
 *
 * --------------------------------------------------------------------------
 *
 *    xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard
 *    Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 *    Authors: Falei LUO <falei.luo@gmail.com>
 *             etc.
 *
 *    Homepage1: http://vcl.idm.pku.edu.cn/xavs2
 *    Homepage2: https://github.com/pkuvcl/xavs2
 *    Homepage3: https://gitee.com/pkuvcl/xavs2
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 *    This program is also available under a commercial proprietary license.
 *    For more information, contact us at sswang @ pku.edu.cn.
 */

#include "../basic_types.h"
#include "intrinsic.h"

#include <mmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>


/* ---------------------------------------------------------------------------
 * Sum of absolute differences between every pixel of a 16x16 block and the
 * rounded mean of that block.
 *
 * p_src   - top-left pixel of the block; every row is fetched with a single
 *           unaligned 16-byte load and widened byte-wise to 16 bits
 *           (NOTE(review): this assumes pel_t is an 8-bit type - confirm)
 * i_src   - distance between two consecutive rows, in pel_t elements
 * cu_size - only used to derive the pixel count (cu_size * cu_size) for the
 *           average; the function always reads 16 rows of 16 bytes, and a
 *           value of 0 would divide by zero
 *
 * Returns the accumulated absolute deviation (not divided by the pixel count).
 */
int mad_16x16_sse128(pel_t *p_src, int i_src, int cu_size)
{
    __m128i zero = _mm_setzero_si128();
    __m128i lo[16];                /* low  8 pixels of each row, 16-bit lanes */
    __m128i hi[16];                /* high 8 pixels of each row, 16-bit lanes */
    __m128i S = zero;              /* per-lane pixel sums */
    __m128i M = zero;              /* per-lane sums of |pixel - average| */
    __m128i avg;
    int num_pix = cu_size * cu_size;
    int sum;
    int f_avg;                     /* average of all pixels in current block */
    int mad;
    int row;

    /* cal average:
     * each 16-bit lane gathers 2 values per row over 16 rows; with byte inputs
     * (0..255) that is at most 32 * 255 = 8160, so the lanes cannot overflow */
    for (row = 0; row < 16; row++) {
        __m128i px = _mm_loadu_si128((__m128i *)(p_src + row * i_src));

        lo[row] = _mm_unpacklo_epi8(px, zero);
        hi[row] = _mm_unpackhi_epi8(px, zero);
        S = _mm_add_epi16(S, _mm_add_epi16(lo[row], hi[row]));
    }

    sum = M128_I16(S, 0) + M128_I16(S, 1) + M128_I16(S, 2) + M128_I16(S, 3)
        + M128_I16(S, 4) + M128_I16(S, 5) + M128_I16(S, 6) + M128_I16(S, 7);
    f_avg = (sum + (num_pix >> 1)) / num_pix;   /* rounded to nearest */

    avg = _mm_set1_epi16((short)f_avg);

    /* cal mad: accumulate |pixel - average| over the widened rows */
    for (row = 0; row < 16; row++) {
        __m128i d_lo = _mm_abs_epi16(_mm_sub_epi16(lo[row], avg));
        __m128i d_hi = _mm_abs_epi16(_mm_sub_epi16(hi[row], avg));

        M = _mm_add_epi16(M, _mm_add_epi16(d_lo, d_hi));
    }

    /* horizontal sum of the deviation accumulator M (not the pixel sum S) */
    mad = M128_U16(M, 0) + M128_U16(M, 1) + M128_U16(M, 2) + M128_U16(M, 3)
        + M128_U16(M, 4) + M128_U16(M, 5) + M128_U16(M, 6) + M128_U16(M, 7);

    return mad;
}

/* ---------------------------------------------------------------------------
*/
int mad_32x32_sse128(pel_t *p_src, int i_src, int cu_size)
{
    __m128i zero;
    __m128i T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30, T31;
    __m128i T0A, T1A, T2A, T3A, T4A, T5A, T6A, T7A, T8A, T9A, T10A, T11A, T12A, T13A, T14A, T15A, T16A, T17A, T18A, T19A, T20A, T21A, T22A, T23A, T24A, T25A, T26A, T27A, T28A, T29A, T30A, T31A;
    __m128i T0B, T1B, T2B, T3B, T4B, T5B, T6B, T7B, T8B, T9B, T10B, T11B, T12B, T13B, T14B, T15B, T16B, T17B, T18B, T19B, T20B, T21B, T22B, T23B, T24B, T25B, T26B, T27B, T28B, T29B, T30B, T31B;
    __m128i T0_0A, T1_0A, T2_0A, T3_0A, T4_0A, T5_0A, T6_0A, T7_0A, T8_0A, T9_0A, T10_0A, T11_0A, T12_0A, T13_0A, T14_0A, T15_0A, T16_0A, T17_0A, T18_0A, T19_0A, T20_0A, T21_0A, T22_0A, T23_0A, T24_0A, T25_0A, T26_0A, T27_0A, T28_0A, T29_0A, T30_0A, T31_0A;
    __m128i T0_1A, T1_1A, T2_1A, T3_1A, T4_1A, T5_1A, T6_1A, T7_1A, T8_1A, T9_1A, T10_1A, T11_1A, T12_1A, T13_1A, T14_1A, T15_1A, T16_1A, T17_1A, T18_1A, T19_1A, T20_1A, T21_1A, T22_1A, T23_1A, T24_1A, T25_1A, T26_1A, T27_1A, T28_1A, T29_1A, T30_1A, T31_1A;
    __m128i T0_0B, T1_0B, T2_0B, T3_0B, T4_0B, T5_0B, T6_0B, T7_0B, T8_0B, T9_0B, T10_0B, T11_0B, T12_0B, T13_0B, T14_0B, T15_0B, T16_0B, T17_0B, T18_0B, T19_0B, T20_0B, T21_0B, T22_0B, T23_0B, T24_0B, T25_0B, T26_0B, T27_0B, T28_0B, T29_0B, T30_0B, T31_0B;
    __m128i T0_1B, T1_1B, T2_1B, T3_1B, T4_1B, T5_1B, T6_1B, T7_1B, T8_1B, T9_1B, T10_1B, T11_1B, T12_1B, T13_1B, T14_1B, T15_1B, T16_1B, T17_1B, T18_1B, T19_1B, T20_1B, T21_1B, T22_1B, T23_1B, T24_1B, T25_1B, T26_1B, T27_1B, T28_1B, T29_1B, T30_1B, T31_1B;
    __m128i S;
    __m128i avg;
    __m128i M;
    int num_pix = cu_size * cu_size;
    int sum = 0;
    int f_avg = 0;                 /* average of all pixels in current block */
    int mad = 0;

    /* cal average */
    zero = _mm_set1_epi8(0);
    T0A = _mm_loadu_si128((__m128i *)p_src);
    T0_0A = _mm_unpacklo_epi8(T0A, zero);
    T0_1A = _mm_unpackhi_epi8(T0A, zero);
    T0A = _mm_add_epi16(T0_0A, T0_1A);
    T0B = _mm_loadu_si128((__m128i *)(p_src + 16));
    T0_0B = _mm_unpacklo_epi8(T0B, zero);
    T0_1B = _mm_unpackhi_epi8(T0B, zero);
    T0B = _mm_add_epi16(T0_0B, T0_1B);
    T0 = _mm_add_epi16(T0A, T0B);

    T1A = _mm_loadu_si128((__m128i *)(p_src + i_src));
    T1_0A = _mm_unpacklo_epi8(T1A, zero);
    T1_1A = _mm_unpackhi_epi8(T1A, zero);
    T1A = _mm_add_epi16(T1_0A, T1_1A);
    T1B = _mm_loadu_si128((__m128i *)(p_src + i_src + 16));
    T1_0B = _mm_unpacklo_epi8(T1B, zero);
    T1_1B = _mm_unpackhi_epi8(T1B, zero);
    T1B = _mm_add_epi16(T1_0B, T1_1B);
    T1 = _mm_add_epi16(T1A, T1B);

    T2A = _mm_loadu_si128((__m128i *)(p_src + 2 * i_src));
    T2_0A = _mm_unpacklo_epi8(T2A, zero);
    T2_1A = _mm_unpackhi_epi8(T2A, zero);
    T2A = _mm_add_epi16(T2_0A, T2_1A);
    T2B = _mm_loadu_si128((__m128i *)(p_src + 2 * i_src + 16));
    T2_0B = _mm_unpacklo_epi8(T2B, zero);
    T2_1B = _mm_unpackhi_epi8(T2B, zero);
    T2B = _mm_add_epi16(T2_0B, T2_1B);
    T2 = _mm_add_epi16(T2A, T2B);

    T3A = _mm_loadu_si128((__m128i *)(p_src + 3 * i_src));
    T3_0A = _mm_unpacklo_epi8(T3A, zero);
    T3_1A = _mm_unpackhi_epi8(T3A, zero);
    T3A = _mm_add_epi16(T3_0A, T3_1A);
    T3B = _mm_loadu_si128((__m128i *)(p_src + 3 * i_src + 16));
    T3_0B = _mm_unpacklo_epi8(T3B, zero);
    T3_1B = _mm_unpackhi_epi8(T3B, zero);
    T3B = _mm_add_epi16(T3_0B, T3_1B);
    T3 = _mm_add_epi16(T3A, T3B);

    T4A = _mm_loadu_si128((__m128i *)(p_src + 4 * i_src));
    T4_0A = _mm_unpacklo_epi8(T4A, zero);
    T4_1A = _mm_unpackhi_epi8(T4A, zero);
    T4A = _mm_add_epi16(T4_0A, T4_1A);
    T4B = _mm_loadu_si128((__m128i *)(p_src + 4 * i_src + 16));
    T4_0B = _mm_unpacklo_epi8(T4B, zero);
    T4_1B = _mm_unpackhi_epi8(T4B, zero);
    T4B = _mm_add_epi16(T4_0B, T4_1B);
    T4 = _mm_add_epi16(T4A, T4B);

    T5A = _mm_loadu_si128((__m128i *)(p_src + 5 * i_src));
    T5_0A = _mm_unpacklo_epi8(T5A, zero);
    T5_1A = _mm_unpackhi_epi8(T5A, zero);
    T5A = _mm_add_epi16(T5_0A, T5_1A);
    T5B = _mm_loadu_si128((__m128i *)(p_src + 5 * i_src + 16));
    T5_0B = _mm_unpacklo_epi8(T5B, zero);
    T5_1B = _mm_unpackhi_epi8(T5B, zero);
    T5B = _mm_add_epi16(T5_0B, T5_1B);
    T5 = _mm_add_epi16(T5A, T5B);

    T6A = _mm_loadu_si128((__m128i *)(p_src + 6 * i_src));
    T6_0A = _mm_unpacklo_epi8(T6A, zero);
    T6_1A = _mm_unpackhi_epi8(T6A, zero);
    T6A = _mm_add_epi16(T6_0A, T6_1A);
    T6B = _mm_loadu_si128((__m128i *)(p_src + 6 * i_src + 16));
    T6_0B = _mm_unpacklo_epi8(T6B, zero);
    T6_1B = _mm_unpackhi_epi8(T6B, zero);
    T6B = _mm_add_epi16(T6_0B, T6_1B);
    T6 = _mm_add_epi16(T6A, T6B);

    T7A = _mm_loadu_si128((__m128i *)(p_src + 7 * i_src));
    T7_0A = _mm_unpacklo_epi8(T7A, zero);
    T7_1A = _mm_unpackhi_epi8(T7A, zero);
    T7A = _mm_add_epi16(T7_0A, T7_1A);
    T7B = _mm_loadu_si128((__m128i *)(p_src + 7 * i_src + 16));
    T7_0B = _mm_unpacklo_epi8(T7B, zero);
    T7_1B = _mm_unpackhi_epi8(T7B, zero);
    T7B = _mm_add_epi16(T7_0B, T7_1B);
    T7 = _mm_add_epi16(T7A, T7B);

    T8A = _mm_loadu_si128((__m128i *)(p_src + 8 * i_src));
    T8_0A = _mm_unpacklo_epi8(T8A, zero);
    T8_1A = _mm_unpackhi_epi8(T8A, zero);
    T8A = _mm_add_epi16(T8_0A, T8_1A);
    T8B = _mm_loadu_si128((__m128i *)(p_src + 8 * i_src + 16));
    T8_0B = _mm_unpacklo_epi8(T8B, zero);
    T8_1B = _mm_unpackhi_epi8(T8B, zero);
    T8B = _mm_add_epi16(T8_0B, T8_1B);
    T8 = _mm_add_epi16(T8A, T8B);

    T9A = _mm_loadu_si128((__m128i *)(p_src + 9 * i_src));
    T9_0A = _mm_unpacklo_epi8(T9A, zero);
    T9_1A = _mm_unpackhi_epi8(T9A, zero);
    T9A = _mm_add_epi16(T9_0A, T9_1A);
    T9B = _mm_loadu_si128((__m128i *)(p_src + 9 * i_src + 16));
    T9_0B = _mm_unpacklo_epi8(T9B, zero);
    T9_1B = _mm_unpackhi_epi8(T9B, zero);
    T9B = _mm_add_epi16(T9_0B, T9_1B);
    T9 = _mm_add_epi16(T9A, T9B);

    T10A = _mm_loadu_si128((__m128i *)(p_src + 10 * i_src));
    T10_0A = _mm_unpacklo_epi8(T10A, zero);
    T10_1A = _mm_unpackhi_epi8(T10A, zero);
    T10A = _mm_add_epi16(T10_0A, T10_1A);
    T10B = _mm_loadu_si128((__m128i *)(p_src + 10 * i_src + 16));
    T10_0B = _mm_unpacklo_epi8(T10B, zero);
    T10_1B = _mm_unpackhi_epi8(T10B, zero);
    T10B = _mm_add_epi16(T10_0B, T10_1B);
    T10 = _mm_add_epi16(T10A, T10B);

    T11A = _mm_loadu_si128((__m128i *)(p_src + 11 * i_src));
    T11_0A = _mm_unpacklo_epi8(T11A, zero);
    T11_1A = _mm_unpackhi_epi8(T11A, zero);
    T11A = _mm_add_epi16(T11_0A, T11_1A);
    T11B = _mm_loadu_si128((__m128i *)(p_src + 11 * i_src + 16));
    T11_0B = _mm_unpacklo_epi8(T11B, zero);
    T11_1B = _mm_unpackhi_epi8(T11B, zero);
    T11B = _mm_add_epi16(T11_0B, T11_1B);
    T11 = _mm_add_epi16(T11A, T11B);

    T12A = _mm_loadu_si128((__m128i *)(p_src + 12 * i_src));
    T12_0A = _mm_unpacklo_epi8(T12A, zero);
    T12_1A = _mm_unpackhi_epi8(T12A, zero);
    T12A = _mm_add_epi16(T12_0A, T12_1A);
    T12B = _mm_loadu_si128((__m128i *)(p_src + 12 * i_src + 16));
    T12_0B = _mm_unpacklo_epi8(T12B, zero);
    T12_1B = _mm_unpackhi_epi8(T12B, zero);
    T12B = _mm_add_epi16(T12_0B, T12_1B);
    T12 = _mm_add_epi16(T12A, T12B);

    T13A = _mm_loadu_si128((__m128i *)(p_src + 13 * i_src));
    T13_0A = _mm_unpacklo_epi8(T13A, zero);
    T13_1A = _mm_unpackhi_epi8(T13A, zero);
    T13A = _mm_add_epi16(T13_0A, T13_1A);
    T13B = _mm_loadu_si128((__m128i *)(p_src + 13 * i_src + 16));
    T13_0B = _mm_unpacklo_epi8(T13B, zero);
    T13_1B = _mm_unpackhi_epi8(T13B, zero);
    T13B = _mm_add_epi16(T13_0B, T13_1B);
    T13 = _mm_add_epi16(T13A, T13B);

    T14A = _mm_loadu_si128((__m128i *)(p_src + 14 * i_src));
    T14_0A = _mm_unpacklo_epi8(T14A, zero);
    T14_1A = _mm_unpackhi_epi8(T14A, zero);
    T14A = _mm_add_epi16(T14_0A, T14_1A);
    T14B = _mm_loadu_si128((__m128i *)(p_src + 14 * i_src + 16));
    T14_0B = _mm_unpacklo_epi8(T14B, zero);
    T14_1B = _mm_unpackhi_epi8(T14B, zero);
    T14B = _mm_add_epi16(T14_0B, T14_1B);
    T14 = _mm_add_epi16(T14A, T14B);

    T15A = _mm_loadu_si128((__m128i *)(p_src + 15 * i_src));
    T15_0A = _mm_unpacklo_epi8(T15A, zero);
    T15_1A = _mm_unpackhi_epi8(T15A, zero);
    T15A = _mm_add_epi16(T15_0A, T15_1A);
    T15B = _mm_loadu_si128((__m128i *)(p_src + 15 * i_src + 16));
    T15_0B = _mm_unpacklo_epi8(T15B, zero);
    T15_1B = _mm_unpackhi_epi8(T15B, zero);
    T15B = _mm_add_epi16(T15_0B, T15_1B);
    T15 = _mm_add_epi16(T15A, T15B);

    T16A = _mm_loadu_si128((__m128i *)(p_src + 16 * i_src));
    T16_0A = _mm_unpacklo_epi8(T16A, zero);
    T16_1A = _mm_unpackhi_epi8(T16A, zero);
    T16A = _mm_add_epi16(T16_0A, T16_1A);
    T16B = _mm_loadu_si128((__m128i *)(p_src + 16 * i_src + 16));
    T16_0B = _mm_unpacklo_epi8(T16B, zero);
    T16_1B = _mm_unpackhi_epi8(T16B, zero);
    T16B = _mm_add_epi16(T16_0B, T16_1B);
    T16 = _mm_add_epi16(T16A, T16B);

    T17A = _mm_loadu_si128((__m128i *)(p_src + 17 * i_src));
    T17_0A = _mm_unpacklo_epi8(T17A, zero);
    T17_1A = _mm_unpackhi_epi8(T17A, zero);
    T17A = _mm_add_epi16(T17_0A, T17_1A);
    T17B = _mm_loadu_si128((__m128i *)(p_src + 17 * i_src + 16));
    T17_0B = _mm_unpacklo_epi8(T17B, zero);
    T17_1B = _mm_unpackhi_epi8(T17B, zero);
    T17B = _mm_add_epi16(T17_0B, T17_1B);
    T17 = _mm_add_epi16(T17A, T17B);

    T18A = _mm_loadu_si128((__m128i *)(p_src + 18 * i_src));
    T18_0A = _mm_unpacklo_epi8(T18A, zero);
    T18_1A = _mm_unpackhi_epi8(T18A, zero);
    T18A = _mm_add_epi16(T18_0A, T18_1A);
    T18B = _mm_loadu_si128((__m128i *)(p_src + 18 * i_src + 16));
    T18_0B = _mm_unpacklo_epi8(T18B, zero);
    T18_1B = _mm_unpackhi_epi8(T18B, zero);
    T18B = _mm_add_epi16(T18_0B, T18_1B);
    T18 = _mm_add_epi16(T18A, T18B);

    T19A = _mm_loadu_si128((__m128i *)(p_src + 19 * i_src));
    T19_0A = _mm_unpacklo_epi8(T19A, zero);
    T19_1A = _mm_unpackhi_epi8(T19A, zero);
    T19A = _mm_add_epi16(T19_0A, T19_1A);
    T19B = _mm_loadu_si128((__m128i *)(p_src + 19 * i_src + 16));
    T19_0B = _mm_unpacklo_epi8(T19B, zero);
    T19_1B = _mm_unpackhi_epi8(T19B, zero);
    T19B = _mm_add_epi16(T19_0B, T19_1B);
    T19 = _mm_add_epi16(T19A, T19B);

    T20A = _mm_loadu_si128((__m128i *)(p_src + 20 * i_src));
    T20_0A = _mm_unpacklo_epi8(T20A, zero);
    T20_1A = _mm_unpackhi_epi8(T20A, zero);
    T20A = _mm_add_epi16(T20_0A, T20_1A);
    T20B = _mm_loadu_si128((__m128i *)(p_src + 20 * i_src + 16));
    T20_0B = _mm_unpacklo_epi8(T20B, zero);
    T20_1B = _mm_unpackhi_epi8(T20B, zero);
    T20B = _mm_add_epi16(T20_0B, T20_1B);
    T20 = _mm_add_epi16(T20A, T20B);

    T21A = _mm_loadu_si128((__m128i *)(p_src + 21 * i_src));
    T21_0A = _mm_unpacklo_epi8(T21A, zero);
    T21_1A = _mm_unpackhi_epi8(T21A, zero);
    T21A = _mm_add_epi16(T21_0A, T21_1A);
    T21B = _mm_loadu_si128((__m128i *)(p_src + 21 * i_src + 16));
    T21_0B = _mm_unpacklo_epi8(T21B, zero);
    T21_1B = _mm_unpackhi_epi8(T21B, zero);
    T21B = _mm_add_epi16(T21_0B, T21_1B);
    T21 = _mm_add_epi16(T21A, T21B);

    T22A = _mm_loadu_si128((__m128i *)(p_src + 22 * i_src));
    T22_0A = _mm_unpacklo_epi8(T22A, zero);
    T22_1A = _mm_unpackhi_epi8(T22A, zero);
    T22A = _mm_add_epi16(T22_0A, T22_1A);
    T22B = _mm_loadu_si128((__m128i *)(p_src + 22 * i_src + 16));
    T22_0B = _mm_unpacklo_epi8(T22B, zero);
    T22_1B = _mm_unpackhi_epi8(T22B, zero);
    T22B = _mm_add_epi16(T22_0B, T22_1B);
    T22 = _mm_add_epi16(T22A, T22B);

    T23A = _mm_loadu_si128((__m128i *)(p_src + 23 * i_src));
    T23_0A = _mm_unpacklo_epi8(T23A, zero);
    T23_1A = _mm_unpackhi_epi8(T23A, zero);
    T23A = _mm_add_epi16(T23_0A, T23_1A);
    T23B = _mm_loadu_si128((__m128i *)(p_src + 23 * i_src + 16));
    T23_0B = _mm_unpacklo_epi8(T23B, zero);
    T23_1B = _mm_unpackhi_epi8(T23B, zero);
    T23B = _mm_add_epi16(T23_0B, T23_1B);
    T23 = _mm_add_epi16(T23A, T23B);

    T24A = _mm_loadu_si128((__m128i *)(p_src + 24 * i_src));
    T24_0A = _mm_unpacklo_epi8(T24A, zero);
    T24_1A = _mm_unpackhi_epi8(T24A, zero);
    T24A = _mm_add_epi16(T24_0A, T24_1A);
    T24B = _mm_loadu_si128((__m128i *)(p_src + 24 * i_src + 16));
    T24_0B = _mm_unpacklo_epi8(T24B, zero);
    T24_1B = _mm_unpackhi_epi8(T24B, zero);
    T24B = _mm_add_epi16(T24_0B, T24_1B);
    T24 = _mm_add_epi16(T24A, T24B);

    T25A = _mm_loadu_si128((__m128i *)(p_src + 25 * i_src));
    T25_0A = _mm_unpacklo_epi8(T25A, zero);
    T25_1A = _mm_unpackhi_epi8(T25A, zero);
    T25A = _mm_add_epi16(T25_0A, T25_1A);
    T25B = _mm_loadu_si128((__m128i *)(p_src + 25 * i_src + 16));
    T25_0B = _mm_unpacklo_epi8(T25B, zero);
    T25_1B = _mm_unpackhi_epi8(T25B, zero);
    T25B = _mm_add_epi16(T25_0B, T25_1B);
    T25 = _mm_add_epi16(T25A, T25B);

    T26A = _mm_loadu_si128((__m128i *)(p_src + 26 * i_src));
    T26_0A = _mm_unpacklo_epi8(T26A, zero);
    T26_1A = _mm_unpackhi_epi8(T26A, zero);
    T26A = _mm_add_epi16(T26_0A, T26_1A);
    T26B = _mm_loadu_si128((__m128i *)(p_src + 26 * i_src + 16));
    T26_0B = _mm_unpacklo_epi8(T26B, zero);
    T26_1B = _mm_unpackhi_epi8(T26B, zero);
    T26B = _mm_add_epi16(T26_0B, T26_1B);
    T26 = _mm_add_epi16(T26A, T26B);

    T27A = _mm_loadu_si128((__m128i *)(p_src + 27 * i_src));
    T27_0A = _mm_unpacklo_epi8(T27A, zero);
    T27_1A = _mm_unpackhi_epi8(T27A, zero);
    T27A = _mm_add_epi16(T27_0A, T27_1A);
    T27B = _mm_loadu_si128((__m128i *)(p_src + 27 * i_src + 16));
    T27_0B = _mm_unpacklo_epi8(T27B, zero);
    T27_1B = _mm_unpackhi_epi8(T27B, zero);
    T27B = _mm_add_epi16(T27_0B, T27_1B);
    T27 = _mm_add_epi16(T27A, T27B);

    T28A = _mm_loadu_si128((__m128i *)(p_src + 28 * i_src));
    T28_0A = _mm_unpacklo_epi8(T28A, zero);
    T28_1A = _mm_unpackhi_epi8(T28A, zero);
    T28A = _mm_add_epi16(T28_0A, T28_1A);
    T28B = _mm_loadu_si128((__m128i *)(p_src + 28 * i_src + 16));
    T28_0B = _mm_unpacklo_epi8(T28B, zero);
    T28_1B = _mm_unpackhi_epi8(T28B, zero);
    T28B = _mm_add_epi16(T28_0B, T28_1B);
    T28 = _mm_add_epi16(T28A, T28B);

    T29A = _mm_loadu_si128((__m128i *)(p_src + 29 * i_src));
    T29_0A = _mm_unpacklo_epi8(T29A, zero);
    T29_1A = _mm_unpackhi_epi8(T29A, zero);
    T29A = _mm_add_epi16(T29_0A, T29_1A);
    T29B = _mm_loadu_si128((__m128i *)(p_src + 29 * i_src + 16));
    T29_0B = _mm_unpacklo_epi8(T29B, zero);
    T29_1B = _mm_unpackhi_epi8(T29B, zero);
    T29B = _mm_add_epi16(T29_0B, T29_1B);
    T29 = _mm_add_epi16(T29A, T29B);

    T30A = _mm_loadu_si128((__m128i *)(p_src + 30 * i_src));
    T30_0A = _mm_unpacklo_epi8(T30A, zero);
    T30_1A = _mm_unpackhi_epi8(T30A, zero);
    T30A = _mm_add_epi16(T30_0A, T30_1A);
    T30B = _mm_loadu_si128((__m128i *)(p_src + 30 * i_src + 16));
    T30_0B = _mm_unpacklo_epi8(T30B, zero);
    T30_1B = _mm_unpackhi_epi8(T30B, zero);
    T30B = _mm_add_epi16(T30_0B, T30_1B);
    T30 = _mm_add_epi16(T30A, T30B);

    T31A = _mm_loadu_si128((__m128i *)(p_src + 31 * i_src));
    T31_0A = _mm_unpacklo_epi8(T31A, zero);
    T31_1A = _mm_unpackhi_epi8(T31A, zero);
    T31A = _mm_add_epi16(T31_0A, T31_1A);
    T31B = _mm_loadu_si128((__m128i *)(p_src + 31 * i_src + 16));
    T31_0B = _mm_unpacklo_epi8(T31B, zero);
    T31_1B = _mm_unpackhi_epi8(T31B, zero);
    T31B = _mm_add_epi16(T31_0B, T31_1B);
    T31 = _mm_add_epi16(T31A, T31B);

    S = _mm_add_epi16(T0, T1);
    S = _mm_add_epi16(S, T2);
    S = _mm_add_epi16(S, T3);
    S = _mm_add_epi16(S, T4);
    S = _mm_add_epi16(S, T5);
    S = _mm_add_epi16(S, T6);
    S = _mm_add_epi16(S, T7);
    S = _mm_add_epi16(S, T8);
    S = _mm_add_epi16(S, T9);
    S = _mm_add_epi16(S, T10);
    S = _mm_add_epi16(S, T11);
    S = _mm_add_epi16(S, T12);
    S = _mm_add_epi16(S, T13);
    S = _mm_add_epi16(S, T14);
    S = _mm_add_epi16(S, T15);
    S = _mm_add_epi16(S, T16);
    S = _mm_add_epi16(S, T17);
    S = _mm_add_epi16(S, T18);
    S = _mm_add_epi16(S, T19);
    S = _mm_add_epi16(S, T20);
    S = _mm_add_epi16(S, T21);
    S = _mm_add_epi16(S, T22);
    S = _mm_add_epi16(S, T23);
    S = _mm_add_epi16(S, T24);
    S = _mm_add_epi16(S, T25);
    S = _mm_add_epi16(S, T26);
    S = _mm_add_epi16(S, T27);
    S = _mm_add_epi16(S, T28);
    S = _mm_add_epi16(S, T29);
    S = _mm_add_epi16(S, T30);
    S = _mm_add_epi16(S, T31);

    sum = M128_I16(S, 0) + M128_I16(S, 1) + M128_I16(S, 2) + M128_I16(S, 3) + M128_I16(S, 4) + M128_I16(S, 5) + M128_I16(S, 6) + M128_I16(S, 7);
    f_avg = (sum + (num_pix >> 1)) / num_pix;

    avg = _mm_set1_epi16((short)f_avg);

    /* cal mad */
    T0_0A = _mm_sub_epi16(T0_0A, avg);
    T0_1A = _mm_sub_epi16(T0_1A, avg);
    T0_0B = _mm_sub_epi16(T0_0B, avg);
    T0_1B = _mm_sub_epi16(T0_1B, avg);
    T1_0A = _mm_sub_epi16(T1_0A, avg);
    T1_1A = _mm_sub_epi16(T1_1A, avg);
    T1_0B = _mm_sub_epi16(T1_0B, avg);
    T1_1B = _mm_sub_epi16(T1_1B, avg);
    T2_0A = _mm_sub_epi16(T2_0A, avg);
    T2_1A = _mm_sub_epi16(T2_1A, avg);
    T2_0B = _mm_sub_epi16(T2_0B, avg);
    T2_1B = _mm_sub_epi16(T2_1B, avg);
    T3_0A = _mm_sub_epi16(T3_0A, avg);
    T3_1A = _mm_sub_epi16(T3_1A, avg);
    T3_0B = _mm_sub_epi16(T3_0B, avg);
    T3_1B = _mm_sub_epi16(T3_1B, avg);
    T4_0A = _mm_sub_epi16(T4_0A, avg);
    T4_1A = _mm_sub_epi16(T4_1A, avg);
    T4_0B = _mm_sub_epi16(T4_0B, avg);
    T4_1B = _mm_sub_epi16(T4_1B, avg);
    T5_0A = _mm_sub_epi16(T5_0A, avg);
    T5_1A = _mm_sub_epi16(T5_1A, avg);
    T5_0B = _mm_sub_epi16(T5_0B, avg);
    T5_1B = _mm_sub_epi16(T5_1B, avg);
    T6_0A = _mm_sub_epi16(T6_0A, avg);
    T6_1A = _mm_sub_epi16(T6_1A, avg);
    T6_0B = _mm_sub_epi16(T6_0B, avg);
    T6_1B = _mm_sub_epi16(T6_1B, avg);
    T7_0A = _mm_sub_epi16(T7_0A, avg);
    T7_1A = _mm_sub_epi16(T7_1A, avg);
    T7_0B = _mm_sub_epi16(T7_0B, avg);
    T7_1B = _mm_sub_epi16(T7_1B, avg);
    T8_0A = _mm_sub_epi16(T8_0A, avg);
    T8_1A = _mm_sub_epi16(T8_1A, avg);
    T8_0B = _mm_sub_epi16(T8_0B, avg);
    T8_1B = _mm_sub_epi16(T8_1B, avg);
    T9_0A = _mm_sub_epi16(T9_0A, avg);
    T9_1A = _mm_sub_epi16(T9_1A, avg);
    T9_0B = _mm_sub_epi16(T9_0B, avg);
    T9_1B = _mm_sub_epi16(T9_1B, avg);
    T10_0A = _mm_sub_epi16(T10_0A, avg);
    T10_1A = _mm_sub_epi16(T10_1A, avg);
    T10_0B = _mm_sub_epi16(T10_0B, avg);
    T10_1B = _mm_sub_epi16(T10_1B, avg);
    T11_0A = _mm_sub_epi16(T11_0A, avg);
    T11_1A = _mm_sub_epi16(T11_1A, avg);
    T11_0B = _mm_sub_epi16(T11_0B, avg);
    T11_1B = _mm_sub_epi16(T11_1B, avg);
    T12_0A = _mm_sub_epi16(T12_0A, avg);
    T12_1A = _mm_sub_epi16(T12_1A, avg);
    T12_0B = _mm_sub_epi16(T12_0B, avg);
    T12_1B = _mm_sub_epi16(T12_1B, avg);
    T13_0A = _mm_sub_epi16(T13_0A, avg);
    T13_1A = _mm_sub_epi16(T13_1A, avg);
    T13_0B = _mm_sub_epi16(T13_0B, avg);
    T13_1B = _mm_sub_epi16(T13_1B, avg);
    T14_0A = _mm_sub_epi16(T14_0A, avg);
    T14_1A = _mm_sub_epi16(T14_1A, avg);
    T14_0B = _mm_sub_epi16(T14_0B, avg);
    T14_1B = _mm_sub_epi16(T14_1B, avg);
    T15_0A = _mm_sub_epi16(T15_0A, avg);
    T15_1A = _mm_sub_epi16(T15_1A, avg);
    T15_0B = _mm_sub_epi16(T15_0B, avg);
    T15_1B = _mm_sub_epi16(T15_1B, avg);
    T16_0A = _mm_sub_epi16(T16_0A, avg);
    T16_1A = _mm_sub_epi16(T16_1A, avg);
    T16_0B = _mm_sub_epi16(T16_0B, avg);
    T16_1B = _mm_sub_epi16(T16_1B, avg);
    T17_0A = _mm_sub_epi16(T17_0A, avg);
    T17_1A = _mm_sub_epi16(T17_1A, avg);
    T17_0B = _mm_sub_epi16(T17_0B, avg);
    T17_1B = _mm_sub_epi16(T17_1B, avg);
    T18_0A = _mm_sub_epi16(T18_0A, avg);
    T18_1A = _mm_sub_epi16(T18_1A, avg);
    T18_0B = _mm_sub_epi16(T18_0B, avg);
    T18_1B = _mm_sub_epi16(T18_1B, avg);
    T19_0A = _mm_sub_epi16(T19_0A, avg);
    T19_1A = _mm_sub_epi16(T19_1A, avg);
    T19_0B = _mm_sub_epi16(T19_0B, avg);
    T19_1B = _mm_sub_epi16(T19_1B, avg);
    T20_0A = _mm_sub_epi16(T20_0A, avg);
    T20_1A = _mm_sub_epi16(T20_1A, avg);
    T20_0B = _mm_sub_epi16(T20_0B, avg);
    T20_1B = _mm_sub_epi16(T20_1B, avg);
    T21_0A = _mm_sub_epi16(T21_0A, avg);
    T21_1A = _mm_sub_epi16(T21_1A, avg);
    T21_0B = _mm_sub_epi16(T21_0B, avg);
    T21_1B = _mm_sub_epi16(T21_1B, avg);
    T22_0A = _mm_sub_epi16(T22_0A, avg);
    T22_1A = _mm_sub_epi16(T22_1A, avg);
    T22_0B = _mm_sub_epi16(T22_0B, avg);
    T22_1B = _mm_sub_epi16(T22_1B, avg);
    T23_0A = _mm_sub_epi16(T23_0A, avg);
    T23_1A = _mm_sub_epi16(T23_1A, avg);
    T23_0B = _mm_sub_epi16(T23_0B, avg);
    T23_1B = _mm_sub_epi16(T23_1B, avg);
    T24_0A = _mm_sub_epi16(T24_0A, avg);
    T24_1A = _mm_sub_epi16(T24_1A, avg);
    T24_0B = _mm_sub_epi16(T24_0B, avg);
    T24_1B = _mm_sub_epi16(T24_1B, avg);
    T25_0A = _mm_sub_epi16(T25_0A, avg);
    T25_1A = _mm_sub_epi16(T25_1A, avg);
    T25_0B = _mm_sub_epi16(T25_0B, avg);
    T25_1B = _mm_sub_epi16(T25_1B, avg);
    T26_0A = _mm_sub_epi16(T26_0A, avg);
    T26_1A = _mm_sub_epi16(T26_1A, avg);
    T26_0B = _mm_sub_epi16(T26_0B, avg);
    T26_1B = _mm_sub_epi16(T26_1B, avg);
    T27_0A = _mm_sub_epi16(T27_0A, avg);
    T27_1A = _mm_sub_epi16(T27_1A, avg);
    T27_0B = _mm_sub_epi16(T27_0B, avg);
    T27_1B = _mm_sub_epi16(T27_1B, avg);
    T28_0A = _mm_sub_epi16(T28_0A, avg);
    T28_1A = _mm_sub_epi16(T28_1A, avg);
    T28_0B = _mm_sub_epi16(T28_0B, avg);
    T28_1B = _mm_sub_epi16(T28_1B, avg);
    T29_0A = _mm_sub_epi16(T29_0A, avg);
    T29_1A = _mm_sub_epi16(T29_1A, avg);
    T29_0B = _mm_sub_epi16(T29_0B, avg);
    T29_1B = _mm_sub_epi16(T29_1B, avg);
    T30_0A = _mm_sub_epi16(T30_0A, avg);
    T30_1A = _mm_sub_epi16(T30_1A, avg);
    T30_0B = _mm_sub_epi16(T30_0B, avg);
    T30_1B = _mm_sub_epi16(T30_1B, avg);
    T31_0A = _mm_sub_epi16(T31_0A, avg);
    T31_1A = _mm_sub_epi16(T31_1A, avg);
    T31_0B = _mm_sub_epi16(T31_0B, avg);
    T31_1B = _mm_sub_epi16(T31_1B, avg);

    T0_0A = _mm_abs_epi16(T0_0A);
    T0_1A = _mm_abs_epi16(T0_1A);
    T0_0B = _mm_abs_epi16(T0_0B);
    T0_1B = _mm_abs_epi16(T0_1B);
    T1_0A = _mm_abs_epi16(T1_0A);
    T1_1A = _mm_abs_epi16(T1_1A);
    T1_0B = _mm_abs_epi16(T1_0B);
    T1_1B = _mm_abs_epi16(T1_1B);
    T2_0A = _mm_abs_epi16(T2_0A);
    T2_1A = _mm_abs_epi16(T2_1A);
    T2_0B = _mm_abs_epi16(T2_0B);
    T2_1B = _mm_abs_epi16(T2_1B);
    T3_0A = _mm_abs_epi16(T3_0A);
    T3_1A = _mm_abs_epi16(T3_1A);
    T3_0B = _mm_abs_epi16(T3_0B);
    T3_1B = _mm_abs_epi16(T3_1B);
    T4_0A = _mm_abs_epi16(T4_0A);
    T4_1A = _mm_abs_epi16(T4_1A);
    T4_0B = _mm_abs_epi16(T4_0B);
    T4_1B = _mm_abs_epi16(T4_1B);
    T5_0A = _mm_abs_epi16(T5_0A);
    T5_1A = _mm_abs_epi16(T5_1A);
    T5_0B = _mm_abs_epi16(T5_0B);
    T5_1B = _mm_abs_epi16(T5_1B);
    T6_0A = _mm_abs_epi16(T6_0A);
    T6_1A = _mm_abs_epi16(T6_1A);
    T6_0B = _mm_abs_epi16(T6_0B);
    T6_1B = _mm_abs_epi16(T6_1B);
    T7_0A = _mm_abs_epi16(T7_0A);
    T7_1A = _mm_abs_epi16(T7_1A);
    T7_0B = _mm_abs_epi16(T7_0B);
    T7_1B = _mm_abs_epi16(T7_1B);
    T8_0A = _mm_abs_epi16(T8_0A);
    T8_1A = _mm_abs_epi16(T8_1A);
    T8_0B = _mm_abs_epi16(T8_0B);
    T8_1B = _mm_abs_epi16(T8_1B);
    T9_0A = _mm_abs_epi16(T9_0A);
    T9_1A = _mm_abs_epi16(T9_1A);
    T9_0B = _mm_abs_epi16(T9_0B);
    T9_1B = _mm_abs_epi16(T9_1B);
    T10_0A = _mm_abs_epi16(T10_0A);
    T10_1A = _mm_abs_epi16(T10_1A);
    T10_0B = _mm_abs_epi16(T10_0B);
    T10_1B = _mm_abs_epi16(T10_1B);
    T11_0A = _mm_abs_epi16(T11_0A);
    T11_1A = _mm_abs_epi16(T11_1A);
    T11_0B = _mm_abs_epi16(T11_0B);
    T11_1B = _mm_abs_epi16(T11_1B);
    T12_0A = _mm_abs_epi16(T12_0A);
    T12_1A = _mm_abs_epi16(T12_1A);
    T12_0B = _mm_abs_epi16(T12_0B);
    T12_1B = _mm_abs_epi16(T12_1B);
    T13_0A = _mm_abs_epi16(T13_0A);
    T13_1A = _mm_abs_epi16(T13_1A);
    T13_0B = _mm_abs_epi16(T13_0B);
    T13_1B = _mm_abs_epi16(T13_1B);
    T14_0A = _mm_abs_epi16(T14_0A);
    T14_1A = _mm_abs_epi16(T14_1A);
    T14_0B = _mm_abs_epi16(T14_0B);
    T14_1B = _mm_abs_epi16(T14_1B);
    T15_0A = _mm_abs_epi16(T15_0A);
    T15_1A = _mm_abs_epi16(T15_1A);
    T15_0B = _mm_abs_epi16(T15_0B);
    T15_1B = _mm_abs_epi16(T15_1B);
    T16_0A = _mm_abs_epi16(T16_0A);
    T16_1A = _mm_abs_epi16(T16_1A);
    T16_0B = _mm_abs_epi16(T16_0B);
    T16_1B = _mm_abs_epi16(T16_1B);
    T17_0A = _mm_abs_epi16(T17_0A);
    T17_1A = _mm_abs_epi16(T17_1A);
    T17_0B = _mm_abs_epi16(T17_0B);
    T17_1B = _mm_abs_epi16(T17_1B);
    T18_0A = _mm_abs_epi16(T18_0A);
    T18_1A = _mm_abs_epi16(T18_1A);
    T18_0B = _mm_abs_epi16(T18_0B);
    T18_1B = _mm_abs_epi16(T18_1B);
    T19_0A = _mm_abs_epi16(T19_0A);
    T19_1A = _mm_abs_epi16(T19_1A);
    T19_0B = _mm_abs_epi16(T19_0B);
    T19_1B = _mm_abs_epi16(T19_1B);
    T20_0A = _mm_abs_epi16(T20_0A);
    T20_1A = _mm_abs_epi16(T20_1A);
    T20_0B = _mm_abs_epi16(T20_0B);
    T20_1B = _mm_abs_epi16(T20_1B);
    T21_0A = _mm_abs_epi16(T21_0A);
    T21_1A = _mm_abs_epi16(T21_1A);
    T21_0B = _mm_abs_epi16(T21_0B);
    T21_1B = _mm_abs_epi16(T21_1B);
    T22_0A = _mm_abs_epi16(T22_0A);
    T22_1A = _mm_abs_epi16(T22_1A);
    T22_0B = _mm_abs_epi16(T22_0B);
    T22_1B = _mm_abs_epi16(T22_1B);
    T23_0A = _mm_abs_epi16(T23_0A);
    T23_1A = _mm_abs_epi16(T23_1A);
    T23_0B = _mm_abs_epi16(T23_0B);
    T23_1B = _mm_abs_epi16(T23_1B);
    T24_0A = _mm_abs_epi16(T24_0A);
    T24_1A = _mm_abs_epi16(T24_1A);
    T24_0B = _mm_abs_epi16(T24_0B);
    T24_1B = _mm_abs_epi16(T24_1B);
    T25_0A = _mm_abs_epi16(T25_0A);
    T25_1A = _mm_abs_epi16(T25_1A);
    T25_0B = _mm_abs_epi16(T25_0B);
    T25_1B = _mm_abs_epi16(T25_1B);
    T26_0A = _mm_abs_epi16(T26_0A);
    T26_1A = _mm_abs_epi16(T26_1A);
    T26_0B = _mm_abs_epi16(T26_0B);
    T26_1B = _mm_abs_epi16(T26_1B);
    T27_0A = _mm_abs_epi16(T27_0A);
    T27_1A = _mm_abs_epi16(T27_1A);
    T27_0B = _mm_abs_epi16(T27_0B);
    T27_1B = _mm_abs_epi16(T27_1B);
    T28_0A = _mm_abs_epi16(T28_0A);
    T28_1A = _mm_abs_epi16(T28_1A);
    T28_0B = _mm_abs_epi16(T28_0B);
    T28_1B = _mm_abs_epi16(T28_1B);
    T29_0A = _mm_abs_epi16(T29_0A);
    T29_1A = _mm_abs_epi16(T29_1A);
    T29_0B = _mm_abs_epi16(T29_0B);
    T29_1B = _mm_abs_epi16(T29_1B);
    T30_0A = _mm_abs_epi16(T30_0A);
    T30_1A = _mm_abs_epi16(T30_1A);
    T30_0B = _mm_abs_epi16(T30_0B);
    T30_1B = _mm_abs_epi16(T30_1B);
    T31_0A = _mm_abs_epi16(T31_0A);
    T31_1A = _mm_abs_epi16(T31_1A);
    T31_0B = _mm_abs_epi16(T31_0B);
    T31_1B = _mm_abs_epi16(T31_1B);

    T0 = _mm_add_epi16(T0_0A, T0_1A);
    T0 = _mm_add_epi16(T0, T0_0B);
    T0 = _mm_add_epi16(T0, T0_1B);
    T1 = _mm_add_epi16(T1_0A, T1_1A);
    T1 = _mm_add_epi16(T1, T1_0B);
    T1 = _mm_add_epi16(T1, T1_1B);
    T2 = _mm_add_epi16(T2_0A, T2_1A);
    T2 = _mm_add_epi16(T2, T2_0B);
    T2 = _mm_add_epi16(T2, T2_1B);
    T3 = _mm_add_epi16(T3_0A, T3_1A);
    T3 = _mm_add_epi16(T3, T3_0B);
    T3 = _mm_add_epi16(T3, T3_1B);
    T4 = _mm_add_epi16(T4_0A, T4_1A);
    T4 = _mm_add_epi16(T4, T4_0B);
    T4 = _mm_add_epi16(T4, T4_1B);
    T5 = _mm_add_epi16(T5_0A, T5_1A);
    T5 = _mm_add_epi16(T5, T5_0B);
    T5 = _mm_add_epi16(T5, T5_1B);
    T6 = _mm_add_epi16(T6_0A, T6_1A);
    T6 = _mm_add_epi16(T6, T6_0B);
    T6 = _mm_add_epi16(T6, T6_1B);
    T7 = _mm_add_epi16(T7_0A, T7_1A);
    T7 = _mm_add_epi16(T7, T7_0B);
    T7 = _mm_add_epi16(T7, T7_1B);
    T8 = _mm_add_epi16(T8_0A, T8_1A);
    T8 = _mm_add_epi16(T8, T8_0B);
    T8 = _mm_add_epi16(T8, T8_1B);
    T9 = _mm_add_epi16(T9_0A, T9_1A);
    T9 = _mm_add_epi16(T9, T9_0B);
    T9 = _mm_add_epi16(T9, T9_1B);
    T10 = _mm_add_epi16(T10_0A, T10_1A);
    T10 = _mm_add_epi16(T10, T10_0B);
    T10 = _mm_add_epi16(T10, T10_1B);
    T11 = _mm_add_epi16(T11_0A, T11_1A);
    T11 = _mm_add_epi16(T11, T11_0B);
    T11 = _mm_add_epi16(T11, T11_1B);
    T12 = _mm_add_epi16(T12_0A, T12_1A);
    T12 = _mm_add_epi16(T12, T12_0B);
    T12 = _mm_add_epi16(T12, T12_1B);
    T13 = _mm_add_epi16(T13_0A, T13_1A);
    T13 = _mm_add_epi16(T13, T13_0B);
    T13 = _mm_add_epi16(T13, T13_1B);
    T14 = _mm_add_epi16(T14_0A, T14_1A);
    T14 = _mm_add_epi16(T14, T14_0B);
    T14 = _mm_add_epi16(T14, T14_1B);
    T15 = _mm_add_epi16(T15_0A, T15_1A);
    T15 = _mm_add_epi16(T15, T15_0B);
    T15 = _mm_add_epi16(T15, T15_1B);
    T16 = _mm_add_epi16(T16_0A, T16_1A);
    T16 = _mm_add_epi16(T16, T16_0B);
    T16 = _mm_add_epi16(T16, T16_1B);
    T17 = _mm_add_epi16(T17_0A, T17_1A);
    T17 = _mm_add_epi16(T17, T17_0B);
    T17 = _mm_add_epi16(T17, T17_1B);
    T18 = _mm_add_epi16(T18_0A, T18_1A);
    T18 = _mm_add_epi16(T18, T18_0B);
    T18 = _mm_add_epi16(T18, T18_1B);
    T19 = _mm_add_epi16(T19_0A, T19_1A);
    T19 = _mm_add_epi16(T19, T19_0B);
    T19 = _mm_add_epi16(T19, T19_1B);
    T20 = _mm_add_epi16(T20_0A, T20_1A);
    T20 = _mm_add_epi16(T20, T20_0B);
    T20 = _mm_add_epi16(T20, T20_1B);
    T21 = _mm_add_epi16(T21_0A, T21_1A);
    T21 = _mm_add_epi16(T21, T21_0B);
    T21 = _mm_add_epi16(T21, T21_1B);
    T22 = _mm_add_epi16(T22_0A, T22_1A);
    T22 = _mm_add_epi16(T22, T22_0B);
    T22 = _mm_add_epi16(T22, T22_1B);
    T23 = _mm_add_epi16(T23_0A, T23_1A);
    T23 = _mm_add_epi16(T23, T23_0B);
    T23 = _mm_add_epi16(T23, T23_1B);
    T24 = _mm_add_epi16(T24_0A, T24_1A);
    T24 = _mm_add_epi16(T24, T24_0B);
    T24 = _mm_add_epi16(T24, T24_1B);
    T25 = _mm_add_epi16(T25_0A, T25_1A);
    T25 = _mm_add_epi16(T25, T25_0B);
    T25 = _mm_add_epi16(T25, T25_1B);
    T26 = _mm_add_epi16(T26_0A, T26_1A);
    T26 = _mm_add_epi16(T26, T26_0B);
    T26 = _mm_add_epi16(T26, T26_1B);
    T27 = _mm_add_epi16(T27_0A, T27_1A);
    T27 = _mm_add_epi16(T27, T27_0B);
    T27 = _mm_add_epi16(T27, T27_1B);
    T28 = _mm_add_epi16(T28_0A, T28_1A);
    T28 = _mm_add_epi16(T28, T28_0B);
    T28 = _mm_add_epi16(T28, T28_1B);
    T29 = _mm_add_epi16(T29_0A, T29_1A);
    T29 = _mm_add_epi16(T29, T29_0B);
    T29 = _mm_add_epi16(T29, T29_1B);
    T30 = _mm_add_epi16(T30_0A, T30_1A);
    T30 = _mm_add_epi16(T30, T30_0B);
    T30 = _mm_add_epi16(T30, T30_1B);
    T31 = _mm_add_epi16(T31_0A, T31_1A);
    T31 = _mm_add_epi16(T31, T31_0B);
    T31 = _mm_add_epi16(T31, T31_1B);

    M = _mm_add_epi16(T0, T1);
    M = _mm_add_epi16(M, T2);
    M = _mm_add_epi16(M, T3);
    M = _mm_add_epi16(M, T4);
    M = _mm_add_epi16(M, T5);
    M = _mm_add_epi16(M, T6);
    M = _mm_add_epi16(M, T7);
    M = _mm_add_epi16(M, T8);
    M = _mm_add_epi16(M, T9);
    M = _mm_add_epi16(M, T10);
    M = _mm_add_epi16(M, T11);
    M = _mm_add_epi16(M, T12);
    M = _mm_add_epi16(M, T13);
    M = _mm_add_epi16(M, T14);
    M = _mm_add_epi16(M, T15);
    M = _mm_add_epi16(M, T16);
    M = _mm_add_epi16(M, T17);
    M = _mm_add_epi16(M, T18);
    M = _mm_add_epi16(M, T19);
    M = _mm_add_epi16(M, T20);
    M = _mm_add_epi16(M, T21);
    M = _mm_add_epi16(M, T22);
    M = _mm_add_epi16(M, T23);
    M = _mm_add_epi16(M, T24);
    M = _mm_add_epi16(M, T25);
    M = _mm_add_epi16(M, T26);
    M = _mm_add_epi16(M, T27);
    M = _mm_add_epi16(M, T28);
    M = _mm_add_epi16(M, T29);
    M = _mm_add_epi16(M, T30);
    M = _mm_add_epi16(M, T31);

    mad = M128_I16(M, 0) + M128_I16(M, 1) + M128_I16(M, 2) + M128_I16(M, 3) + M128_I16(M, 4) + M128_I16(M, 5) + M128_I16(M, 6) + M128_I16(M, 7);

    return mad;
}

/* ---------------------------------------------------------------------------
*/
int mad_64x64_sse128(pel_t *p_src, int i_src, int cu_size)
{
    __m128i zero;
    __m128i T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45, T46, T47, T48, T49, T50, T51, T52, T53, T54, T55, T56, T57, T58, T59, T60, T61, T62, T63;
    __m128i T0A, T1A, T2A, T3A, T4A, T5A, T6A, T7A, T8A, T9A, T10A, T11A, T12A, T13A, T14A, T15A, T16A, T17A, T18A, T19A, T20A, T21A, T22A, T23A, T24A, T25A, T26A, T27A, T28A, T29A, T30A, T31A, T32A, T33A, T34A, T35A, T36A, T37A, T38A, T39A, T40A, T41A, T42A, T43A, T44A, T45A, T46A, T47A, T48A, T49A, T50A, T51A, T52A, T53A, T54A, T55A, T56A, T57A, T58A, T59A, T60A, T61A, T62A, T63A;
    __m128i T0B, T1B, T2B, T3B, T4B, T5B, T6B, T7B, T8B, T9B, T10B, T11B, T12B, T13B, T14B, T15B, T16B, T17B, T18B, T19B, T20B, T21B, T22B, T23B, T24B, T25B, T26B, T27B, T28B, T29B, T30B, T31B, T32B, T33B, T34B, T35B, T36B, T37B, T38B, T39B, T40B, T41B, T42B, T43B, T44B, T45B, T46B, T47B, T48B, T49B, T50B, T51B, T52B, T53B, T54B, T55B, T56B, T57B, T58B, T59B, T60B, T61B, T62B, T63B;
    __m128i T0C, T1C, T2C, T3C, T4C, T5C, T6C, T7C, T8C, T9C, T10C, T11C, T12C, T13C, T14C, T15C, T16C, T17C, T18C, T19C, T20C, T21C, T22C, T23C, T24C, T25C, T26C, T27C, T28C, T29C, T30C, T31C, T32C, T33C, T34C, T35C, T36C, T37C, T38C, T39C, T40C, T41C, T42C, T43C, T44C, T45C, T46C, T47C, T48C, T49C, T50C, T51C, T52C, T53C, T54C, T55C, T56C, T57C, T58C, T59C, T60C, T61C, T62C, T63C;
    __m128i T0D, T1D, T2D, T3D, T4D, T5D, T6D, T7D, T8D, T9D, T10D, T11D, T12D, T13D, T14D, T15D, T16D, T17D, T18D, T19D, T20D, T21D, T22D, T23D, T24D, T25D, T26D, T27D, T28D, T29D, T30D, T31D, T32D, T33D, T34D, T35D, T36D, T37D, T38D, T39D, T40D, T41D, T42D, T43D, T44D, T45D, T46D, T47D, T48D, T49D, T50D, T51D, T52D, T53D, T54D, T55D, T56D, T57D, T58D, T59D, T60D, T61D, T62D, T63D;
    __m128i T0_0A, T1_0A, T2_0A, T3_0A, T4_0A, T5_0A, T6_0A, T7_0A, T8_0A, T9_0A, T10_0A, T11_0A, T12_0A, T13_0A, T14_0A, T15_0A, T16_0A, T17_0A, T18_0A, T19_0A, T20_0A, T21_0A, T22_0A, T23_0A, T24_0A, T25_0A, T26_0A, T27_0A, T28_0A, T29_0A, T30_0A, T31_0A, T32_0A, T33_0A, T34_0A, T35_0A, T36_0A, T37_0A, T38_0A, T39_0A, T40_0A, T41_0A, T42_0A, T43_0A, T44_0A, T45_0A, T46_0A, T47_0A, T48_0A, T49_0A, T50_0A, T51_0A, T52_0A, T53_0A, T54_0A, T55_0A, T56_0A, T57_0A, T58_0A, T59_0A, T60_0A, T61_0A, T62_0A, T63_0A;
    __m128i T0_1A, T1_1A, T2_1A, T3_1A, T4_1A, T5_1A, T6_1A, T7_1A, T8_1A, T9_1A, T10_1A, T11_1A, T12_1A, T13_1A, T14_1A, T15_1A, T16_1A, T17_1A, T18_1A, T19_1A, T20_1A, T21_1A, T22_1A, T23_1A, T24_1A, T25_1A, T26_1A, T27_1A, T28_1A, T29_1A, T30_1A, T31_1A, T32_1A, T33_1A, T34_1A, T35_1A, T36_1A, T37_1A, T38_1A, T39_1A, T40_1A, T41_1A, T42_1A, T43_1A, T44_1A, T45_1A, T46_1A, T47_1A, T48_1A, T49_1A, T50_1A, T51_1A, T52_1A, T53_1A, T54_1A, T55_1A, T56_1A, T57_1A, T58_1A, T59_1A, T60_1A, T61_1A, T62_1A, T63_1A;
    __m128i T0_0B, T1_0B, T2_0B, T3_0B, T4_0B, T5_0B, T6_0B, T7_0B, T8_0B, T9_0B, T10_0B, T11_0B, T12_0B, T13_0B, T14_0B, T15_0B, T16_0B, T17_0B, T18_0B, T19_0B, T20_0B, T21_0B, T22_0B, T23_0B, T24_0B, T25_0B, T26_0B, T27_0B, T28_0B, T29_0B, T30_0B, T31_0B, T32_0B, T33_0B, T34_0B, T35_0B, T36_0B, T37_0B, T38_0B, T39_0B, T40_0B, T41_0B, T42_0B, T43_0B, T44_0B, T45_0B, T46_0B, T47_0B, T48_0B, T49_0B, T50_0B, T51_0B, T52_0B, T53_0B, T54_0B, T55_0B, T56_0B, T57_0B, T58_0B, T59_0B, T60_0B, T61_0B, T62_0B, T63_0B;
    __m128i T0_1B, T1_1B, T2_1B, T3_1B, T4_1B, T5_1B, T6_1B, T7_1B, T8_1B, T9_1B, T10_1B, T11_1B, T12_1B, T13_1B, T14_1B, T15_1B, T16_1B, T17_1B, T18_1B, T19_1B, T20_1B, T21_1B, T22_1B, T23_1B, T24_1B, T25_1B, T26_1B, T27_1B, T28_1B, T29_1B, T30_1B, T31_1B, T32_1B, T33_1B, T34_1B, T35_1B, T36_1B, T37_1B, T38_1B, T39_1B, T40_1B, T41_1B, T42_1B, T43_1B, T44_1B, T45_1B, T46_1B, T47_1B, T48_1B, T49_1B, T50_1B, T51_1B, T52_1B, T53_1B, T54_1B, T55_1B, T56_1B, T57_1B, T58_1B, T59_1B, T60_1B, T61_1B, T62_1B, T63_1B;
    __m128i T0_0C, T1_0C, T2_0C, T3_0C, T4_0C, T5_0C, T6_0C, T7_0C, T8_0C, T9_0C, T10_0C, T11_0C, T12_0C, T13_0C, T14_0C, T15_0C, T16_0C, T17_0C, T18_0C, T19_0C, T20_0C, T21_0C, T22_0C, T23_0C, T24_0C, T25_0C, T26_0C, T27_0C, T28_0C, T29_0C, T30_0C, T31_0C, T32_0C, T33_0C, T34_0C, T35_0C, T36_0C, T37_0C, T38_0C, T39_0C, T40_0C, T41_0C, T42_0C, T43_0C, T44_0C, T45_0C, T46_0C, T47_0C, T48_0C, T49_0C, T50_0C, T51_0C, T52_0C, T53_0C, T54_0C, T55_0C, T56_0C, T57_0C, T58_0C, T59_0C, T60_0C, T61_0C, T62_0C, T63_0C;
    __m128i T0_1C, T1_1C, T2_1C, T3_1C, T4_1C, T5_1C, T6_1C, T7_1C, T8_1C, T9_1C, T10_1C, T11_1C, T12_1C, T13_1C, T14_1C, T15_1C, T16_1C, T17_1C, T18_1C, T19_1C, T20_1C, T21_1C, T22_1C, T23_1C, T24_1C, T25_1C, T26_1C, T27_1C, T28_1C, T29_1C, T30_1C, T31_1C, T32_1C, T33_1C, T34_1C, T35_1C, T36_1C, T37_1C, T38_1C, T39_1C, T40_1C, T41_1C, T42_1C, T43_1C, T44_1C, T45_1C, T46_1C, T47_1C, T48_1C, T49_1C, T50_1C, T51_1C, T52_1C, T53_1C, T54_1C, T55_1C, T56_1C, T57_1C, T58_1C, T59_1C, T60_1C, T61_1C, T62_1C, T63_1C;
    __m128i T0_0D, T1_0D, T2_0D, T3_0D, T4_0D, T5_0D, T6_0D, T7_0D, T8_0D, T9_0D, T10_0D, T11_0D, T12_0D, T13_0D, T14_0D, T15_0D, T16_0D, T17_0D, T18_0D, T19_0D, T20_0D, T21_0D, T22_0D, T23_0D, T24_0D, T25_0D, T26_0D, T27_0D, T28_0D, T29_0D, T30_0D, T31_0D, T32_0D, T33_0D, T34_0D, T35_0D, T36_0D, T37_0D, T38_0D, T39_0D, T40_0D, T41_0D, T42_0D, T43_0D, T44_0D, T45_0D, T46_0D, T47_0D, T48_0D, T49_0D, T50_0D, T51_0D, T52_0D, T53_0D, T54_0D, T55_0D, T56_0D, T57_0D, T58_0D, T59_0D, T60_0D, T61_0D, T62_0D, T63_0D;
    __m128i T0_1D, T1_1D, T2_1D, T3_1D, T4_1D, T5_1D, T6_1D, T7_1D, T8_1D, T9_1D, T10_1D, T11_1D, T12_1D, T13_1D, T14_1D, T15_1D, T16_1D, T17_1D, T18_1D, T19_1D, T20_1D, T21_1D, T22_1D, T23_1D, T24_1D, T25_1D, T26_1D, T27_1D, T28_1D, T29_1D, T30_1D, T31_1D, T32_1D, T33_1D, T34_1D, T35_1D, T36_1D, T37_1D, T38_1D, T39_1D, T40_1D, T41_1D, T42_1D, T43_1D, T44_1D, T45_1D, T46_1D, T47_1D, T48_1D, T49_1D, T50_1D, T51_1D, T52_1D, T53_1D, T54_1D, T55_1D, T56_1D, T57_1D, T58_1D, T59_1D, T60_1D, T61_1D, T62_1D, T63_1D;
    __m128i S1, S2, S3, S;
    __m128i avg;
    __m128i M1, M2, M;
    int sum1, sum2, sum3;
    int mad1, mad2, mads;
    int num_pix = cu_size * cu_size;
    int sum = 0;
    int f_avg = 0;                 /* average of all pixels in current block */
    int mad = 0;

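    /* Calculate the average of all pixels in the block.
     *
     * Scalar reference (kept for documentation only; the intrinsics that
     * follow load each row, widen the pixels to 16 bits and add them up):
     *
     *   for (int y = 0; y < cu_size; ++y) {
     *       int sum_row = 0;
     *       for (int x = 0; x < cu_size; ++x) {
     *           sum_row += p_src[x];
     *       }
     *       sum += sum_row;
     *       p_src += i_src;
     *   }
     *   f_avg = sum / num_pix;
     */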
    zero = _mm_set1_epi8(0);
    T0A = _mm_loadu_si128((__m128i *)p_src);
    T0_0A = _mm_unpacklo_epi8(T0A, zero);
    T0_1A = _mm_unpackhi_epi8(T0A, zero);
    T0A = _mm_add_epi16(T0_0A, T0_1A);
    T0B = _mm_loadu_si128((__m128i *)(p_src + 16));
    T0_0B = _mm_unpacklo_epi8(T0B, zero);
    T0_1B = _mm_unpackhi_epi8(T0B, zero);
    T0B = _mm_add_epi16(T0_0B, T0_1B);
    T0C = _mm_loadu_si128((__m128i *)(p_src + 32));
    T0_0C = _mm_unpacklo_epi8(T0C, zero);
    T0_1C = _mm_unpackhi_epi8(T0C, zero);
    T0C = _mm_add_epi16(T0_0C, T0_1C);
    T0D = _mm_loadu_si128((__m128i *)(p_src + 48));
    T0_0D = _mm_unpacklo_epi8(T0D, zero);
    T0_1D = _mm_unpackhi_epi8(T0D, zero);
    T0D = _mm_add_epi16(T0_0D, T0_1D);
    T0 = _mm_add_epi16(T0A, T0B);
    T0 = _mm_add_epi16(T0, T0C);
    T0 = _mm_add_epi16(T0, T0D);

    T1A = _mm_loadu_si128((__m128i *)(p_src + i_src));
    T1_0A = _mm_unpacklo_epi8(T1A, zero);
    T1_1A = _mm_unpackhi_epi8(T1A, zero);
    T1A = _mm_add_epi16(T1_0A, T1_1A);
    T1B = _mm_loadu_si128((__m128i *)(p_src + i_src + 16));
    T1_0B = _mm_unpacklo_epi8(T1B, zero);
    T1_1B = _mm_unpackhi_epi8(T1B, zero);
    T1B = _mm_add_epi16(T1_0B, T1_1B);
    T1C = _mm_loadu_si128((__m128i *)(p_src + i_src + 32));
    T1_0C = _mm_unpacklo_epi8(T1C, zero);
    T1_1C = _mm_unpackhi_epi8(T1C, zero);
    T1C = _mm_add_epi16(T1_0C, T1_1C);
    T1D = _mm_loadu_si128((__m128i *)(p_src + i_src + 48));
    T1_0D = _mm_unpacklo_epi8(T1D, zero);
    T1_1D = _mm_unpackhi_epi8(T1D, zero);
    T1D = _mm_add_epi16(T1_0D, T1_1D);
    T1 = _mm_add_epi16(T1A, T1B);
    T1 = _mm_add_epi16(T1, T1C);
    T1 = _mm_add_epi16(T1, T1D);

    T2A = _mm_loadu_si128((__m128i *)(p_src + 2 * i_src));
    T2_0A = _mm_unpacklo_epi8(T2A, zero);
    T2_1A = _mm_unpackhi_epi8(T2A, zero);
    T2A = _mm_add_epi16(T2_0A, T2_1A);
    T2B = _mm_loadu_si128((__m128i *)(p_src + 2 * i_src + 16));
    T2_0B = _mm_unpacklo_epi8(T2B, zero);
    T2_1B = _mm_unpackhi_epi8(T2B, zero);
    T2B = _mm_add_epi16(T2_0B, T2_1B);
    T2C = _mm_loadu_si128((__m128i *)(p_src + 2 * i_src + 32));
    T2_0C = _mm_unpacklo_epi8(T2C, zero);
    T2_1C = _mm_unpackhi_epi8(T2C, zero);
    T2C = _mm_add_epi16(T2_0C, T2_1C);
    T2D = _mm_loadu_si128((__m128i *)(p_src + 2 * i_src + 48));
    T2_0D = _mm_unpacklo_epi8(T2D, zero);
    T2_1D = _mm_unpackhi_epi8(T2D, zero);
    T2D = _mm_add_epi16(T2_0D, T2_1D);
    T2 = _mm_add_epi16(T2A, T2B);
    T2 = _mm_add_epi16(T2, T2C);
    T2 = _mm_add_epi16(T2, T2D);

    T3A = _mm_loadu_si128((__m128i *)(p_src + 3 * i_src));
    T3_0A = _mm_unpacklo_epi8(T3A, zero);
    T3_1A = _mm_unpackhi_epi8(T3A, zero);
    T3A = _mm_add_epi16(T3_0A, T3_1A);
    T3B = _mm_loadu_si128((__m128i *)(p_src + 3 * i_src + 16));
    T3_0B = _mm_unpacklo_epi8(T3B, zero);
    T3_1B = _mm_unpackhi_epi8(T3B, zero);
    T3B = _mm_add_epi16(T3_0B, T3_1B);
    T3C = _mm_loadu_si128((__m128i *)(p_src + 3 * i_src + 32));
    T3_0C = _mm_unpacklo_epi8(T3C, zero);
    T3_1C = _mm_unpackhi_epi8(T3C, zero);
    T3C = _mm_add_epi16(T3_0C, T3_1C);
    T3D = _mm_loadu_si128((__m128i *)(p_src + 3 * i_src + 48));
    T3_0D = _mm_unpacklo_epi8(T3D, zero);
    T3_1D = _mm_unpackhi_epi8(T3D, zero);
    T3D = _mm_add_epi16(T3_0D, T3_1D);
    T3 = _mm_add_epi16(T3A, T3B);
    T3 = _mm_add_epi16(T3, T3C);
    T3 = _mm_add_epi16(T3, T3D);

    T4A = _mm_loadu_si128((__m128i *)(p_src + 4 * i_src));
    T4_0A = _mm_unpacklo_epi8(T4A, zero);
    T4_1A = _mm_unpackhi_epi8(T4A, zero);
    T4A = _mm_add_epi16(T4_0A, T4_1A);
    T4B = _mm_loadu_si128((__m128i *)(p_src + 4 * i_src + 16));
    T4_0B = _mm_unpacklo_epi8(T4B, zero);
    T4_1B = _mm_unpackhi_epi8(T4B, zero);
    T4B = _mm_add_epi16(T4_0B, T4_1B);
    T4C = _mm_loadu_si128((__m128i *)(p_src + 4 * i_src + 32));
    T4_0C = _mm_unpacklo_epi8(T4C, zero);
    T4_1C = _mm_unpackhi_epi8(T4C, zero);
    T4C = _mm_add_epi16(T4_0C, T4_1C);
    T4D = _mm_loadu_si128((__m128i *)(p_src + 4 * i_src + 48));
    T4_0D = _mm_unpacklo_epi8(T4D, zero);
    T4_1D = _mm_unpackhi_epi8(T4D, zero);
    T4D = _mm_add_epi16(T4_0D, T4_1D);
    T4 = _mm_add_epi16(T4A, T4B);
    T4 = _mm_add_epi16(T4, T4C);
    T4 = _mm_add_epi16(T4, T4D);

    T5A = _mm_loadu_si128((__m128i *)(p_src + 5 * i_src));
    T5_0A = _mm_unpacklo_epi8(T5A, zero);
    T5_1A = _mm_unpackhi_epi8(T5A, zero);
    T5A = _mm_add_epi16(T5_0A, T5_1A);
    T5B = _mm_loadu_si128((__m128i *)(p_src + 5 * i_src + 16));
    T5_0B = _mm_unpacklo_epi8(T5B, zero);
    T5_1B = _mm_unpackhi_epi8(T5B, zero);
    T5B = _mm_add_epi16(T5_0B, T5_1B);
    T5C = _mm_loadu_si128((__m128i *)(p_src + 5 * i_src + 32));
    T5_0C = _mm_unpacklo_epi8(T5C, zero);
    T5_1C = _mm_unpackhi_epi8(T5C, zero);
    T5C = _mm_add_epi16(T5_0C, T5_1C);
    T5D = _mm_loadu_si128((__m128i *)(p_src + 5 * i_src + 48));
    T5_0D = _mm_unpacklo_epi8(T5D, zero);
    T5_1D = _mm_unpackhi_epi8(T5D, zero);
    T5D = _mm_add_epi16(T5_0D, T5_1D);
    T5 = _mm_add_epi16(T5A, T5B);
    T5 = _mm_add_epi16(T5, T5C);
    T5 = _mm_add_epi16(T5, T5D);

    T6A = _mm_loadu_si128((__m128i *)(p_src + 6 * i_src));
    T6_0A = _mm_unpacklo_epi8(T6A, zero);
    T6_1A = _mm_unpackhi_epi8(T6A, zero);
    T6A = _mm_add_epi16(T6_0A, T6_1A);
    T6B = _mm_loadu_si128((__m128i *)(p_src + 6 * i_src + 16));
    T6_0B = _mm_unpacklo_epi8(T6B, zero);
    T6_1B = _mm_unpackhi_epi8(T6B, zero);
    T6B = _mm_add_epi16(T6_0B, T6_1B);
    T6C = _mm_loadu_si128((__m128i *)(p_src + 6 * i_src + 32));
    T6_0C = _mm_unpacklo_epi8(T6C, zero);
    T6_1C = _mm_unpackhi_epi8(T6C, zero);
    T6C = _mm_add_epi16(T6_0C, T6_1C);
    T6D = _mm_loadu_si128((__m128i *)(p_src + 6 * i_src + 48));
    T6_0D = _mm_unpacklo_epi8(T6D, zero);
    T6_1D = _mm_unpackhi_epi8(T6D, zero);
    T6D = _mm_add_epi16(T6_0D, T6_1D);
    T6 = _mm_add_epi16(T6A, T6B);
    T6 = _mm_add_epi16(T6, T6C);
    T6 = _mm_add_epi16(T6, T6D);

    T7A = _mm_loadu_si128((__m128i *)(p_src + 7 * i_src));
    T7_0A = _mm_unpacklo_epi8(T7A, zero);
    T7_1A = _mm_unpackhi_epi8(T7A, zero);
    T7A = _mm_add_epi16(T7_0A, T7_1A);
    T7B = _mm_loadu_si128((__m128i *)(p_src + 7 * i_src + 16));
    T7_0B = _mm_unpacklo_epi8(T7B, zero);
    T7_1B = _mm_unpackhi_epi8(T7B, zero);
    T7B = _mm_add_epi16(T7_0B, T7_1B);
    T7C = _mm_loadu_si128((__m128i *)(p_src + 7 * i_src + 32));
    T7_0C = _mm_unpacklo_epi8(T7C, zero);
    T7_1C = _mm_unpackhi_epi8(T7C, zero);
    T7C = _mm_add_epi16(T7_0C, T7_1C);
    T7D = _mm_loadu_si128((__m128i *)(p_src + 7 * i_src + 48));
    T7_0D = _mm_unpacklo_epi8(T7D, zero);
    T7_1D = _mm_unpackhi_epi8(T7D, zero);
    T7D = _mm_add_epi16(T7_0D, T7_1D);
    T7 = _mm_add_epi16(T7A, T7B);
    T7 = _mm_add_epi16(T7, T7C);
    T7 = _mm_add_epi16(T7, T7D);

    T8A = _mm_loadu_si128((__m128i *)(p_src + 8 * i_src));
    T8_0A = _mm_unpacklo_epi8(T8A, zero);
    T8_1A = _mm_unpackhi_epi8(T8A, zero);
    T8A = _mm_add_epi16(T8_0A, T8_1A);
    T8B = _mm_loadu_si128((__m128i *)(p_src + 8 * i_src + 16));
    T8_0B = _mm_unpacklo_epi8(T8B, zero);
    T8_1B = _mm_unpackhi_epi8(T8B, zero);
    T8B = _mm_add_epi16(T8_0B, T8_1B);
    T8C = _mm_loadu_si128((__m128i *)(p_src + 8 * i_src + 32));
    T8_0C = _mm_unpacklo_epi8(T8C, zero);
    T8_1C = _mm_unpackhi_epi8(T8C, zero);
    T8C = _mm_add_epi16(T8_0C, T8_1C);
    T8D = _mm_loadu_si128((__m128i *)(p_src + 8 * i_src + 48));
    T8_0D = _mm_unpacklo_epi8(T8D, zero);
    T8_1D = _mm_unpackhi_epi8(T8D, zero);
    T8D = _mm_add_epi16(T8_0D, T8_1D);
    T8 = _mm_add_epi16(T8A, T8B);
    T8 = _mm_add_epi16(T8, T8C);
    T8 = _mm_add_epi16(T8, T8D);

    T9A = _mm_loadu_si128((__m128i *)(p_src + 9 * i_src));
    T9_0A = _mm_unpacklo_epi8(T9A, zero);
    T9_1A = _mm_unpackhi_epi8(T9A, zero);
    T9A = _mm_add_epi16(T9_0A, T9_1A);
    T9B = _mm_loadu_si128((__m128i *)(p_src + 9 * i_src + 16));
    T9_0B = _mm_unpacklo_epi8(T9B, zero);
    T9_1B = _mm_unpackhi_epi8(T9B, zero);
    T9B = _mm_add_epi16(T9_0B, T9_1B);
    T9C = _mm_loadu_si128((__m128i *)(p_src + 9 * i_src + 32));
    T9_0C = _mm_unpacklo_epi8(T9C, zero);
    T9_1C = _mm_unpackhi_epi8(T9C, zero);
    T9C = _mm_add_epi16(T9_0C, T9_1C);
    T9D = _mm_loadu_si128((__m128i *)(p_src + 9 * i_src + 48));
    T9_0D = _mm_unpacklo_epi8(T9D, zero);
    T9_1D = _mm_unpackhi_epi8(T9D, zero);
    T9D = _mm_add_epi16(T9_0D, T9_1D);
    T9 = _mm_add_epi16(T9A, T9B);
    T9 = _mm_add_epi16(T9, T9C);
    T9 = _mm_add_epi16(T9, T9D);

    T10A = _mm_loadu_si128((__m128i *)(p_src + 10 * i_src));
    T10_0A = _mm_unpacklo_epi8(T10A, zero);
    T10_1A = _mm_unpackhi_epi8(T10A, zero);
    T10A = _mm_add_epi16(T10_0A, T10_1A);
    T10B = _mm_loadu_si128((__m128i *)(p_src + 10 * i_src + 16));
    T10_0B = _mm_unpacklo_epi8(T10B, zero);
    T10_1B = _mm_unpackhi_epi8(T10B, zero);
    T10B = _mm_add_epi16(T10_0B, T10_1B);
    T10C = _mm_loadu_si128((__m128i *)(p_src + 10 * i_src + 32));
    T10_0C = _mm_unpacklo_epi8(T10C, zero);
    T10_1C = _mm_unpackhi_epi8(T10C, zero);
    T10C = _mm_add_epi16(T10_0C, T10_1C);
    T10D = _mm_loadu_si128((__m128i *)(p_src + 10 * i_src + 48));
    T10_0D = _mm_unpacklo_epi8(T10D, zero);
    T10_1D = _mm_unpackhi_epi8(T10D, zero);
    T10D = _mm_add_epi16(T10_0D, T10_1D);
    T10 = _mm_add_epi16(T10A, T10B);
    T10 = _mm_add_epi16(T10, T10C);
    T10 = _mm_add_epi16(T10, T10D);

    T11A = _mm_loadu_si128((__m128i *)(p_src + 11 * i_src));
    T11_0A = _mm_unpacklo_epi8(T11A, zero);
    T11_1A = _mm_unpackhi_epi8(T11A, zero);
    T11A = _mm_add_epi16(T11_0A, T11_1A);
    T11B = _mm_loadu_si128((__m128i *)(p_src + 11 * i_src + 16));
    T11_0B = _mm_unpacklo_epi8(T11B, zero);
    T11_1B = _mm_unpackhi_epi8(T11B, zero);
    T11B = _mm_add_epi16(T11_0B, T11_1B);
    T11C = _mm_loadu_si128((__m128i *)(p_src + 11 * i_src + 32));
    T11_0C = _mm_unpacklo_epi8(T11C, zero);
    T11_1C = _mm_unpackhi_epi8(T11C, zero);
    T11C = _mm_add_epi16(T11_0C, T11_1C);
    T11D = _mm_loadu_si128((__m128i *)(p_src + 11 * i_src + 48));
    T11_0D = _mm_unpacklo_epi8(T11D, zero);
    T11_1D = _mm_unpackhi_epi8(T11D, zero);
    T11D = _mm_add_epi16(T11_0D, T11_1D);
    T11 = _mm_add_epi16(T11A, T11B);
    T11 = _mm_add_epi16(T11, T11C);
    T11 = _mm_add_epi16(T11, T11D);

    T12A = _mm_loadu_si128((__m128i *)(p_src + 12 * i_src));
    T12_0A = _mm_unpacklo_epi8(T12A, zero);
    T12_1A = _mm_unpackhi_epi8(T12A, zero);
    T12A = _mm_add_epi16(T12_0A, T12_1A);
    T12B = _mm_loadu_si128((__m128i *)(p_src + 12 * i_src + 16));
    T12_0B = _mm_unpacklo_epi8(T12B, zero);
    T12_1B = _mm_unpackhi_epi8(T12B, zero);
    T12B = _mm_add_epi16(T12_0B, T12_1B);
    T12C = _mm_loadu_si128((__m128i *)(p_src + 12 * i_src + 32));
    T12_0C = _mm_unpacklo_epi8(T12C, zero);
    T12_1C = _mm_unpackhi_epi8(T12C, zero);
    T12C = _mm_add_epi16(T12_0C, T12_1C);
    T12D = _mm_loadu_si128((__m128i *)(p_src + 12 * i_src + 48));
    T12_0D = _mm_unpacklo_epi8(T12D, zero);
    T12_1D = _mm_unpackhi_epi8(T12D, zero);
    T12D = _mm_add_epi16(T12_0D, T12_1D);
    T12 = _mm_add_epi16(T12A, T12B);
    T12 = _mm_add_epi16(T12, T12C);
    T12 = _mm_add_epi16(T12, T12D);

    T13A = _mm_loadu_si128((__m128i *)(p_src + 13 * i_src));
    T13_0A = _mm_unpacklo_epi8(T13A, zero);
    T13_1A = _mm_unpackhi_epi8(T13A, zero);
    T13A = _mm_add_epi16(T13_0A, T13_1A);
    T13B = _mm_loadu_si128((__m128i *)(p_src + 13 * i_src + 16));
    T13_0B = _mm_unpacklo_epi8(T13B, zero);
    T13_1B = _mm_unpackhi_epi8(T13B, zero);
    T13B = _mm_add_epi16(T13_0B, T13_1B);
    T13C = _mm_loadu_si128((__m128i *)(p_src + 13 * i_src + 32));
    T13_0C = _mm_unpacklo_epi8(T13C, zero);
    T13_1C = _mm_unpackhi_epi8(T13C, zero);
    T13C = _mm_add_epi16(T13_0C, T13_1C);
    T13D = _mm_loadu_si128((__m128i *)(p_src + 13 * i_src + 48));
    T13_0D = _mm_unpacklo_epi8(T13D, zero);
    T13_1D = _mm_unpackhi_epi8(T13D, zero);
    T13D = _mm_add_epi16(T13_0D, T13_1D);
    T13 = _mm_add_epi16(T13A, T13B);
    T13 = _mm_add_epi16(T13, T13C);
    T13 = _mm_add_epi16(T13, T13D);

    T14A = _mm_loadu_si128((__m128i *)(p_src + 14 * i_src));
    T14_0A = _mm_unpacklo_epi8(T14A, zero);
    T14_1A = _mm_unpackhi_epi8(T14A, zero);
    T14A = _mm_add_epi16(T14_0A, T14_1A);
    T14B = _mm_loadu_si128((__m128i *)(p_src + 14 * i_src + 16));
    T14_0B = _mm_unpacklo_epi8(T14B, zero);
    T14_1B = _mm_unpackhi_epi8(T14B, zero);
    T14B = _mm_add_epi16(T14_0B, T14_1B);
    T14C = _mm_loadu_si128((__m128i *)(p_src + 14 * i_src + 32));
    T14_0C = _mm_unpacklo_epi8(T14C, zero);
    T14_1C = _mm_unpackhi_epi8(T14C, zero);
    T14C = _mm_add_epi16(T14_0C, T14_1C);
    T14D = _mm_loadu_si128((__m128i *)(p_src + 14 * i_src + 48));
    T14_0D = _mm_unpacklo_epi8(T14D, zero);
    T14_1D = _mm_unpackhi_epi8(T14D, zero);
    T14D = _mm_add_epi16(T14_0D, T14_1D);
    T14 = _mm_add_epi16(T14A, T14B);
    T14 = _mm_add_epi16(T14, T14C);
    T14 = _mm_add_epi16(T14, T14D);

    T15A = _mm_loadu_si128((__m128i *)(p_src + 15 * i_src));
    T15_0A = _mm_unpacklo_epi8(T15A, zero);
    T15_1A = _mm_unpackhi_epi8(T15A, zero);
    T15A = _mm_add_epi16(T15_0A, T15_1A);
    T15B = _mm_loadu_si128((__m128i *)(p_src + 15 * i_src + 16));
    T15_0B = _mm_unpacklo_epi8(T15B, zero);
    T15_1B = _mm_unpackhi_epi8(T15B, zero);
    T15B = _mm_add_epi16(T15_0B, T15_1B);
    T15C = _mm_loadu_si128((__m128i *)(p_src + 15 * i_src + 32));
    T15_0C = _mm_unpacklo_epi8(T15C, zero);
    T15_1C = _mm_unpackhi_epi8(T15C, zero);
    T15C = _mm_add_epi16(T15_0C, T15_1C);
    T15D = _mm_loadu_si128((__m128i *)(p_src + 15 * i_src + 48));
    T15_0D = _mm_unpacklo_epi8(T15D, zero);
    T15_1D = _mm_unpackhi_epi8(T15D, zero);
    T15D = _mm_add_epi16(T15_0D, T15_1D);
    T15 = _mm_add_epi16(T15A, T15B);
    T15 = _mm_add_epi16(T15, T15C);
    T15 = _mm_add_epi16(T15, T15D);

    T16A = _mm_loadu_si128((__m128i *)(p_src + 16 * i_src));
    T16_0A = _mm_unpacklo_epi8(T16A, zero);
    T16_1A = _mm_unpackhi_epi8(T16A, zero);
    T16A = _mm_add_epi16(T16_0A, T16_1A);
    T16B = _mm_loadu_si128((__m128i *)(p_src + 16 * i_src + 16));
    T16_0B = _mm_unpacklo_epi8(T16B, zero);
    T16_1B = _mm_unpackhi_epi8(T16B, zero);
    T16B = _mm_add_epi16(T16_0B, T16_1B);
    T16C = _mm_loadu_si128((__m128i *)(p_src + 16 * i_src + 32));
    T16_0C = _mm_unpacklo_epi8(T16C, zero);
    T16_1C = _mm_unpackhi_epi8(T16C, zero);
    T16C = _mm_add_epi16(T16_0C, T16_1C);
    T16D = _mm_loadu_si128((__m128i *)(p_src + 16 * i_src + 48));
    T16_0D = _mm_unpacklo_epi8(T16D, zero);
    T16_1D = _mm_unpackhi_epi8(T16D, zero);
    T16D = _mm_add_epi16(T16_0D, T16_1D);
    T16 = _mm_add_epi16(T16A, T16B);
    T16 = _mm_add_epi16(T16, T16C);
    T16 = _mm_add_epi16(T16, T16D);

    T17A = _mm_loadu_si128((__m128i *)(p_src + 17 * i_src));
    T17_0A = _mm_unpacklo_epi8(T17A, zero);
    T17_1A = _mm_unpackhi_epi8(T17A, zero);
    T17A = _mm_add_epi16(T17_0A, T17_1A);
    T17B = _mm_loadu_si128((__m128i *)(p_src + 17 * i_src + 16));
    T17_0B = _mm_unpacklo_epi8(T17B, zero);
    T17_1B = _mm_unpackhi_epi8(T17B, zero);
    T17B = _mm_add_epi16(T17_0B, T17_1B);
    T17C = _mm_loadu_si128((__m128i *)(p_src + 17 * i_src + 32));
    T17_0C = _mm_unpacklo_epi8(T17C, zero);
    T17_1C = _mm_unpackhi_epi8(T17C, zero);
    T17C = _mm_add_epi16(T17_0C, T17_1C);
    T17D = _mm_loadu_si128((__m128i *)(p_src + 17 * i_src + 48));
    T17_0D = _mm_unpacklo_epi8(T17D, zero);
    T17_1D = _mm_unpackhi_epi8(T17D, zero);
    T17D = _mm_add_epi16(T17_0D, T17_1D);
    T17 = _mm_add_epi16(T17A, T17B);
    T17 = _mm_add_epi16(T17, T17C);
    T17 = _mm_add_epi16(T17, T17D);

    T18A = _mm_loadu_si128((__m128i *)(p_src + 18 * i_src));
    T18_0A = _mm_unpacklo_epi8(T18A, zero);
    T18_1A = _mm_unpackhi_epi8(T18A, zero);
    T18A = _mm_add_epi16(T18_0A, T18_1A);
    T18B = _mm_loadu_si128((__m128i *)(p_src + 18 * i_src + 16));
    T18_0B = _mm_unpacklo_epi8(T18B, zero);
    T18_1B = _mm_unpackhi_epi8(T18B, zero);
    T18B = _mm_add_epi16(T18_0B, T18_1B);
    T18C = _mm_loadu_si128((__m128i *)(p_src + 18 * i_src + 32));
    T18_0C = _mm_unpacklo_epi8(T18C, zero);
    T18_1C = _mm_unpackhi_epi8(T18C, zero);
    T18C = _mm_add_epi16(T18_0C, T18_1C);
    T18D = _mm_loadu_si128((__m128i *)(p_src + 18 * i_src + 48));
    T18_0D = _mm_unpacklo_epi8(T18D, zero);
    T18_1D = _mm_unpackhi_epi8(T18D, zero);
    T18D = _mm_add_epi16(T18_0D, T18_1D);
    T18 = _mm_add_epi16(T18A, T18B);
    T18 = _mm_add_epi16(T18, T18C);
    T18 = _mm_add_epi16(T18, T18D);

    T19A = _mm_loadu_si128((__m128i *)(p_src + 19 * i_src));
    T19_0A = _mm_unpacklo_epi8(T19A, zero);
    T19_1A = _mm_unpackhi_epi8(T19A, zero);
    T19A = _mm_add_epi16(T19_0A, T19_1A);
    T19B = _mm_loadu_si128((__m128i *)(p_src + 19 * i_src + 16));
    T19_0B = _mm_unpacklo_epi8(T19B, zero);
    T19_1B = _mm_unpackhi_epi8(T19B, zero);
    T19B = _mm_add_epi16(T19_0B, T19_1B);
    T19C = _mm_loadu_si128((__m128i *)(p_src + 19 * i_src + 32));
    T19_0C = _mm_unpacklo_epi8(T19C, zero);
    T19_1C = _mm_unpackhi_epi8(T19C, zero);
    T19C = _mm_add_epi16(T19_0C, T19_1C);
    T19D = _mm_loadu_si128((__m128i *)(p_src + 19 * i_src + 48));
    T19_0D = _mm_unpacklo_epi8(T19D, zero);
    T19_1D = _mm_unpackhi_epi8(T19D, zero);
    T19D = _mm_add_epi16(T19_0D, T19_1D);
    T19 = _mm_add_epi16(T19A, T19B);
    T19 = _mm_add_epi16(T19, T19C);
    T19 = _mm_add_epi16(T19, T19D);

    T20A = _mm_loadu_si128((__m128i *)(p_src + 20 * i_src));
    T20_0A = _mm_unpacklo_epi8(T20A, zero);
    T20_1A = _mm_unpackhi_epi8(T20A, zero);
    T20A = _mm_add_epi16(T20_0A, T20_1A);
    T20B = _mm_loadu_si128((__m128i *)(p_src + 20 * i_src + 16));
    T20_0B = _mm_unpacklo_epi8(T20B, zero);
    T20_1B = _mm_unpackhi_epi8(T20B, zero);
    T20B = _mm_add_epi16(T20_0B, T20_1B);
    T20C = _mm_loadu_si128((__m128i *)(p_src + 20 * i_src + 32));
    T20_0C = _mm_unpacklo_epi8(T20C, zero);
    T20_1C = _mm_unpackhi_epi8(T20C, zero);
    T20C = _mm_add_epi16(T20_0C, T20_1C);
    T20D = _mm_loadu_si128((__m128i *)(p_src + 20 * i_src + 48));
    T20_0D = _mm_unpacklo_epi8(T20D, zero);
    T20_1D = _mm_unpackhi_epi8(T20D, zero);
    T20D = _mm_add_epi16(T20_0D, T20_1D);
    T20 = _mm_add_epi16(T20A, T20B);
    T20 = _mm_add_epi16(T20, T20C);
    T20 = _mm_add_epi16(T20, T20D);

    T21A = _mm_loadu_si128((__m128i *)(p_src + 21 * i_src));
    T21_0A = _mm_unpacklo_epi8(T21A, zero);
    T21_1A = _mm_unpackhi_epi8(T21A, zero);
    T21A = _mm_add_epi16(T21_0A, T21_1A);
    T21B = _mm_loadu_si128((__m128i *)(p_src + 21 * i_src + 16));
    T21_0B = _mm_unpacklo_epi8(T21B, zero);
    T21_1B = _mm_unpackhi_epi8(T21B, zero);
    T21B = _mm_add_epi16(T21_0B, T21_1B);
    T21C = _mm_loadu_si128((__m128i *)(p_src + 21 * i_src + 32));
    T21_0C = _mm_unpacklo_epi8(T21C, zero);
    T21_1C = _mm_unpackhi_epi8(T21C, zero);
    T21C = _mm_add_epi16(T21_0C, T21_1C);
    T21D = _mm_loadu_si128((__m128i *)(p_src + 21 * i_src + 48));
    T21_0D = _mm_unpacklo_epi8(T21D, zero);
    T21_1D = _mm_unpackhi_epi8(T21D, zero);
    T21D = _mm_add_epi16(T21_0D, T21_1D);
    T21 = _mm_add_epi16(T21A, T21B);
    T21 = _mm_add_epi16(T21, T21C);
    T21 = _mm_add_epi16(T21, T21D);

    T22A = _mm_loadu_si128((__m128i *)(p_src + 22 * i_src));
    T22_0A = _mm_unpacklo_epi8(T22A, zero);
    T22_1A = _mm_unpackhi_epi8(T22A, zero);
    T22A = _mm_add_epi16(T22_0A, T22_1A);
    T22B = _mm_loadu_si128((__m128i *)(p_src + 22 * i_src + 16));
    T22_0B = _mm_unpacklo_epi8(T22B, zero);
    T22_1B = _mm_unpackhi_epi8(T22B, zero);
    T22B = _mm_add_epi16(T22_0B, T22_1B);
    T22C = _mm_loadu_si128((__m128i *)(p_src + 22 * i_src + 32));
    T22_0C = _mm_unpacklo_epi8(T22C, zero);
    T22_1C = _mm_unpackhi_epi8(T22C, zero);
    T22C = _mm_add_epi16(T22_0C, T22_1C);
    T22D = _mm_loadu_si128((__m128i *)(p_src + 22 * i_src + 48));
    T22_0D = _mm_unpacklo_epi8(T22D, zero);
    T22_1D = _mm_unpackhi_epi8(T22D, zero);
    T22D = _mm_add_epi16(T22_0D, T22_1D);
    T22 = _mm_add_epi16(T22A, T22B);
    T22 = _mm_add_epi16(T22, T22C);
    T22 = _mm_add_epi16(T22, T22D);

    T23A = _mm_loadu_si128((__m128i *)(p_src + 23 * i_src));
    T23_0A = _mm_unpacklo_epi8(T23A, zero);
    T23_1A = _mm_unpackhi_epi8(T23A, zero);
    T23A = _mm_add_epi16(T23_0A, T23_1A);
    T23B = _mm_loadu_si128((__m128i *)(p_src + 23 * i_src + 16));
    T23_0B = _mm_unpacklo_epi8(T23B, zero);
    T23_1B = _mm_unpackhi_epi8(T23B, zero);
    T23B = _mm_add_epi16(T23_0B, T23_1B);
    T23C = _mm_loadu_si128((__m128i *)(p_src + 23 * i_src + 32));
    T23_0C = _mm_unpacklo_epi8(T23C, zero);
    T23_1C = _mm_unpackhi_epi8(T23C, zero);
    T23C = _mm_add_epi16(T23_0C, T23_1C);
    T23D = _mm_loadu_si128((__m128i *)(p_src + 23 * i_src + 48));
    T23_0D = _mm_unpacklo_epi8(T23D, zero);
    T23_1D = _mm_unpackhi_epi8(T23D, zero);
    T23D = _mm_add_epi16(T23_0D, T23_1D);
    T23 = _mm_add_epi16(T23A, T23B);
    T23 = _mm_add_epi16(T23, T23C);
    T23 = _mm_add_epi16(T23, T23D);

    T24A = _mm_loadu_si128((__m128i *)(p_src + 24 * i_src));
    T24_0A = _mm_unpacklo_epi8(T24A, zero);
    T24_1A = _mm_unpackhi_epi8(T24A, zero);
    T24A = _mm_add_epi16(T24_0A, T24_1A);
    T24B = _mm_loadu_si128((__m128i *)(p_src + 24 * i_src + 16));
    T24_0B = _mm_unpacklo_epi8(T24B, zero);
    T24_1B = _mm_unpackhi_epi8(T24B, zero);
    T24B = _mm_add_epi16(T24_0B, T24_1B);
    T24C = _mm_loadu_si128((__m128i *)(p_src + 24 * i_src + 32));
    T24_0C = _mm_unpacklo_epi8(T24C, zero);
    T24_1C = _mm_unpackhi_epi8(T24C, zero);
    T24C = _mm_add_epi16(T24_0C, T24_1C);
    T24D = _mm_loadu_si128((__m128i *)(p_src + 24 * i_src + 48));
    T24_0D = _mm_unpacklo_epi8(T24D, zero);
    T24_1D = _mm_unpackhi_epi8(T24D, zero);
    T24D = _mm_add_epi16(T24_0D, T24_1D);
    T24 = _mm_add_epi16(T24A, T24B);
    T24 = _mm_add_epi16(T24, T24C);
    T24 = _mm_add_epi16(T24, T24D);

    T25A = _mm_loadu_si128((__m128i *)(p_src + 25 * i_src));
    T25_0A = _mm_unpacklo_epi8(T25A, zero);
    T25_1A = _mm_unpackhi_epi8(T25A, zero);
    T25A = _mm_add_epi16(T25_0A, T25_1A);
    T25B = _mm_loadu_si128((__m128i *)(p_src + 25 * i_src + 16));
    T25_0B = _mm_unpacklo_epi8(T25B, zero);
    T25_1B = _mm_unpackhi_epi8(T25B, zero);
    T25B = _mm_add_epi16(T25_0B, T25_1B);
    T25C = _mm_loadu_si128((__m128i *)(p_src + 25 * i_src + 32));
    T25_0C = _mm_unpacklo_epi8(T25C, zero);
    T25_1C = _mm_unpackhi_epi8(T25C, zero);
    T25C = _mm_add_epi16(T25_0C, T25_1C);
    T25D = _mm_loadu_si128((__m128i *)(p_src + 25 * i_src + 48));
    T25_0D = _mm_unpacklo_epi8(T25D, zero);
    T25_1D = _mm_unpackhi_epi8(T25D, zero);
    T25D = _mm_add_epi16(T25_0D, T25_1D);
    T25 = _mm_add_epi16(T25A, T25B);
    T25 = _mm_add_epi16(T25, T25C);
    T25 = _mm_add_epi16(T25, T25D);

    T26A = _mm_loadu_si128((__m128i *)(p_src + 26 * i_src));
    T26_0A = _mm_unpacklo_epi8(T26A, zero);
    T26_1A = _mm_unpackhi_epi8(T26A, zero);
    T26A = _mm_add_epi16(T26_0A, T26_1A);
    T26B = _mm_loadu_si128((__m128i *)(p_src + 26 * i_src + 16));
    T26_0B = _mm_unpacklo_epi8(T26B, zero);
    T26_1B = _mm_unpackhi_epi8(T26B, zero);
    T26B = _mm_add_epi16(T26_0B, T26_1B);
    T26C = _mm_loadu_si128((__m128i *)(p_src + 26 * i_src + 32));
    T26_0C = _mm_unpacklo_epi8(T26C, zero);
    T26_1C = _mm_unpackhi_epi8(T26C, zero);
    T26C = _mm_add_epi16(T26_0C, T26_1C);
    T26D = _mm_loadu_si128((__m128i *)(p_src + 26 * i_src + 48));
    T26_0D = _mm_unpacklo_epi8(T26D, zero);
    T26_1D = _mm_unpackhi_epi8(T26D, zero);
    T26D = _mm_add_epi16(T26_0D, T26_1D);
    T26 = _mm_add_epi16(T26A, T26B);
    T26 = _mm_add_epi16(T26, T26C);
    T26 = _mm_add_epi16(T26, T26D);

    T27A = _mm_loadu_si128((__m128i *)(p_src + 27 * i_src));
    T27_0A = _mm_unpacklo_epi8(T27A, zero);
    T27_1A = _mm_unpackhi_epi8(T27A, zero);
    T27A = _mm_add_epi16(T27_0A, T27_1A);
    T27B = _mm_loadu_si128((__m128i *)(p_src + 27 * i_src + 16));
    T27_0B = _mm_unpacklo_epi8(T27B, zero);
    T27_1B = _mm_unpackhi_epi8(T27B, zero);
    T27B = _mm_add_epi16(T27_0B, T27_1B);
    T27C = _mm_loadu_si128((__m128i *)(p_src + 27 * i_src + 32));
    T27_0C = _mm_unpacklo_epi8(T27C, zero);
    T27_1C = _mm_unpackhi_epi8(T27C, zero);
    T27C = _mm_add_epi16(T27_0C, T27_1C);
    T27D = _mm_loadu_si128((__m128i *)(p_src + 27 * i_src + 48));
    T27_0D = _mm_unpacklo_epi8(T27D, zero);
    T27_1D = _mm_unpackhi_epi8(T27D, zero);
    T27D = _mm_add_epi16(T27_0D, T27_1D);
    T27 = _mm_add_epi16(T27A, T27B);
    T27 = _mm_add_epi16(T27, T27C);
    T27 = _mm_add_epi16(T27, T27D);

    T28A = _mm_loadu_si128((__m128i *)(p_src + 28 * i_src));
    T28_0A = _mm_unpacklo_epi8(T28A, zero);
    T28_1A = _mm_unpackhi_epi8(T28A, zero);
    T28A = _mm_add_epi16(T28_0A, T28_1A);
    T28B = _mm_loadu_si128((__m128i *)(p_src + 28 * i_src + 16));
    T28_0B = _mm_unpacklo_epi8(T28B, zero);
    T28_1B = _mm_unpackhi_epi8(T28B, zero);
    T28B = _mm_add_epi16(T28_0B, T28_1B);
    T28C = _mm_loadu_si128((__m128i *)(p_src + 28 * i_src + 32));
    T28_0C = _mm_unpacklo_epi8(T28C, zero);
    T28_1C = _mm_unpackhi_epi8(T28C, zero);
    T28C = _mm_add_epi16(T28_0C, T28_1C);
    T28D = _mm_loadu_si128((__m128i *)(p_src + 28 * i_src + 48));
    T28_0D = _mm_unpacklo_epi8(T28D, zero);
    T28_1D = _mm_unpackhi_epi8(T28D, zero);
    T28D = _mm_add_epi16(T28_0D, T28_1D);
    T28 = _mm_add_epi16(T28A, T28B);
    T28 = _mm_add_epi16(T28, T28C);
    T28 = _mm_add_epi16(T28, T28D);

    T29A = _mm_loadu_si128((__m128i *)(p_src + 29 * i_src));
    T29_0A = _mm_unpacklo_epi8(T29A, zero);
    T29_1A = _mm_unpackhi_epi8(T29A, zero);
    T29A = _mm_add_epi16(T29_0A, T29_1A);
    T29B = _mm_loadu_si128((__m128i *)(p_src + 29 * i_src + 16));
    T29_0B = _mm_unpacklo_epi8(T29B, zero);
    T29_1B = _mm_unpackhi_epi8(T29B, zero);
    T29B = _mm_add_epi16(T29_0B, T29_1B);
    T29C = _mm_loadu_si128((__m128i *)(p_src + 29 * i_src + 32));
    T29_0C = _mm_unpacklo_epi8(T29C, zero);
    T29_1C = _mm_unpackhi_epi8(T29C, zero);
    T29C = _mm_add_epi16(T29_0C, T29_1C);
    T29D = _mm_loadu_si128((__m128i *)(p_src + 29 * i_src + 48));
    T29_0D = _mm_unpacklo_epi8(T29D, zero);
    T29_1D = _mm_unpackhi_epi8(T29D, zero);
    T29D = _mm_add_epi16(T29_0D, T29_1D);
    T29 = _mm_add_epi16(T29A, T29B);
    T29 = _mm_add_epi16(T29, T29C);
    T29 = _mm_add_epi16(T29, T29D);

    T30A = _mm_loadu_si128((__m128i *)(p_src + 30 * i_src));
    T30_0A = _mm_unpacklo_epi8(T30A, zero);
    T30_1A = _mm_unpackhi_epi8(T30A, zero);
    T30A = _mm_add_epi16(T30_0A, T30_1A);
    T30B = _mm_loadu_si128((__m128i *)(p_src + 30 * i_src + 16));
    T30_0B = _mm_unpacklo_epi8(T30B, zero);
    T30_1B = _mm_unpackhi_epi8(T30B, zero);
    T30B = _mm_add_epi16(T30_0B, T30_1B);
    T30C = _mm_loadu_si128((__m128i *)(p_src + 30 * i_src + 32));
    T30_0C = _mm_unpacklo_epi8(T30C, zero);
    T30_1C = _mm_unpackhi_epi8(T30C, zero);
    T30C = _mm_add_epi16(T30_0C, T30_1C);
    T30D = _mm_loadu_si128((__m128i *)(p_src + 30 * i_src + 48));
    T30_0D = _mm_unpacklo_epi8(T30D, zero);
    T30_1D = _mm_unpackhi_epi8(T30D, zero);
    T30D = _mm_add_epi16(T30_0D, T30_1D);
    T30 = _mm_add_epi16(T30A, T30B);
    T30 = _mm_add_epi16(T30, T30C);
    T30 = _mm_add_epi16(T30, T30D);

    T31A = _mm_loadu_si128((__m128i *)(p_src + 31 * i_src));
    T31_0A = _mm_unpacklo_epi8(T31A, zero);
    T31_1A = _mm_unpackhi_epi8(T31A, zero);
    T31A = _mm_add_epi16(T31_0A, T31_1A);
    T31B = _mm_loadu_si128((__m128i *)(p_src + 31 * i_src + 16));
    T31_0B = _mm_unpacklo_epi8(T31B, zero);
    T31_1B = _mm_unpackhi_epi8(T31B, zero);
    T31B = _mm_add_epi16(T31_0B, T31_1B);
    T31C = _mm_loadu_si128((__m128i *)(p_src + 31 * i_src + 32));
    T31_0C = _mm_unpacklo_epi8(T31C, zero);
    T31_1C = _mm_unpackhi_epi8(T31C, zero);
    T31C = _mm_add_epi16(T31_0C, T31_1C);
    T31D = _mm_loadu_si128((__m128i *)(p_src + 31 * i_src + 48));
    T31_0D = _mm_unpacklo_epi8(T31D, zero);
    T31_1D = _mm_unpackhi_epi8(T31D, zero);
    T31D = _mm_add_epi16(T31_0D, T31_1D);
    T31 = _mm_add_epi16(T31A, T31B);
    T31 = _mm_add_epi16(T31, T31C);
    T31 = _mm_add_epi16(T31, T31D);

    T32A = _mm_loadu_si128((__m128i *)(p_src + 32 * i_src));
    T32_0A = _mm_unpacklo_epi8(T32A, zero);
    T32_1A = _mm_unpackhi_epi8(T32A, zero);
    T32A = _mm_add_epi16(T32_0A, T32_1A);
    T32B = _mm_loadu_si128((__m128i *)(p_src + 32 * i_src + 16));
    T32_0B = _mm_unpacklo_epi8(T32B, zero);
    T32_1B = _mm_unpackhi_epi8(T32B, zero);
    T32B = _mm_add_epi16(T32_0B, T32_1B);
    T32C = _mm_loadu_si128((__m128i *)(p_src + 32 * i_src + 32));
    T32_0C = _mm_unpacklo_epi8(T32C, zero);
    T32_1C = _mm_unpackhi_epi8(T32C, zero);
    T32C = _mm_add_epi16(T32_0C, T32_1C);
    T32D = _mm_loadu_si128((__m128i *)(p_src + 32 * i_src + 48));
    T32_0D = _mm_unpacklo_epi8(T32D, zero);
    T32_1D = _mm_unpackhi_epi8(T32D, zero);
    T32D = _mm_add_epi16(T32_0D, T32_1D);
    T32 = _mm_add_epi16(T32A, T32B);
    T32 = _mm_add_epi16(T32, T32C);
    T32 = _mm_add_epi16(T32, T32D);

    T33A = _mm_loadu_si128((__m128i *)(p_src + 33 * i_src));
    T33_0A = _mm_unpacklo_epi8(T33A, zero);
    T33_1A = _mm_unpackhi_epi8(T33A, zero);
    T33A = _mm_add_epi16(T33_0A, T33_1A);
    T33B = _mm_loadu_si128((__m128i *)(p_src + 33 * i_src + 16));
    T33_0B = _mm_unpacklo_epi8(T33B, zero);
    T33_1B = _mm_unpackhi_epi8(T33B, zero);
    T33B = _mm_add_epi16(T33_0B, T33_1B);
    T33C = _mm_loadu_si128((__m128i *)(p_src + 33 * i_src + 32));
    T33_0C = _mm_unpacklo_epi8(T33C, zero);
    T33_1C = _mm_unpackhi_epi8(T33C, zero);
    T33C = _mm_add_epi16(T33_0C, T33_1C);
    T33D = _mm_loadu_si128((__m128i *)(p_src + 33 * i_src + 48));
    T33_0D = _mm_unpacklo_epi8(T33D, zero);
    T33_1D = _mm_unpackhi_epi8(T33D, zero);
    T33D = _mm_add_epi16(T33_0D, T33_1D);
    T33 = _mm_add_epi16(T33A, T33B);
    T33 = _mm_add_epi16(T33, T33C);
    T33 = _mm_add_epi16(T33, T33D);

    T34A = _mm_loadu_si128((__m128i *)(p_src + 34 * i_src));
    T34_0A = _mm_unpacklo_epi8(T34A, zero);
    T34_1A = _mm_unpackhi_epi8(T34A, zero);
    T34A = _mm_add_epi16(T34_0A, T34_1A);
    T34B = _mm_loadu_si128((__m128i *)(p_src + 34 * i_src + 16));
    T34_0B = _mm_unpacklo_epi8(T34B, zero);
    T34_1B = _mm_unpackhi_epi8(T34B, zero);
    T34B = _mm_add_epi16(T34_0B, T34_1B);
    T34C = _mm_loadu_si128((__m128i *)(p_src + 34 * i_src + 32));
    T34_0C = _mm_unpacklo_epi8(T34C, zero);
    T34_1C = _mm_unpackhi_epi8(T34C, zero);
    T34C = _mm_add_epi16(T34_0C, T34_1C);
    T34D = _mm_loadu_si128((__m128i *)(p_src + 34 * i_src + 48));
    T34_0D = _mm_unpacklo_epi8(T34D, zero);
    T34_1D = _mm_unpackhi_epi8(T34D, zero);
    T34D = _mm_add_epi16(T34_0D, T34_1D);
    T34 = _mm_add_epi16(T34A, T34B);
    T34 = _mm_add_epi16(T34, T34C);
    T34 = _mm_add_epi16(T34, T34D);

    T35A = _mm_loadu_si128((__m128i *)(p_src + 35 * i_src));
    T35_0A = _mm_unpacklo_epi8(T35A, zero);
    T35_1A = _mm_unpackhi_epi8(T35A, zero);
    T35A = _mm_add_epi16(T35_0A, T35_1A);
    T35B = _mm_loadu_si128((__m128i *)(p_src + 35 * i_src + 16));
    T35_0B = _mm_unpacklo_epi8(T35B, zero);
    T35_1B = _mm_unpackhi_epi8(T35B, zero);
    T35B = _mm_add_epi16(T35_0B, T35_1B);
    T35C = _mm_loadu_si128((__m128i *)(p_src + 35 * i_src + 32));
    T35_0C = _mm_unpacklo_epi8(T35C, zero);
    T35_1C = _mm_unpackhi_epi8(T35C, zero);
    T35C = _mm_add_epi16(T35_0C, T35_1C);
    T35D = _mm_loadu_si128((__m128i *)(p_src + 35 * i_src + 48));
    T35_0D = _mm_unpacklo_epi8(T35D, zero);
    T35_1D = _mm_unpackhi_epi8(T35D, zero);
    T35D = _mm_add_epi16(T35_0D, T35_1D);
    T35 = _mm_add_epi16(T35A, T35B);
    T35 = _mm_add_epi16(T35, T35C);
    T35 = _mm_add_epi16(T35, T35D);

    T36A = _mm_loadu_si128((__m128i *)(p_src + 36 * i_src));
    T36_0A = _mm_unpacklo_epi8(T36A, zero);
    T36_1A = _mm_unpackhi_epi8(T36A, zero);
    T36A = _mm_add_epi16(T36_0A, T36_1A);
    T36B = _mm_loadu_si128((__m128i *)(p_src + 36 * i_src + 16));
    T36_0B = _mm_unpacklo_epi8(T36B, zero);
    T36_1B = _mm_unpackhi_epi8(T36B, zero);
    T36B = _mm_add_epi16(T36_0B, T36_1B);
    T36C = _mm_loadu_si128((__m128i *)(p_src + 36 * i_src + 32));
    T36_0C = _mm_unpacklo_epi8(T36C, zero);
    T36_1C = _mm_unpackhi_epi8(T36C, zero);
    T36C = _mm_add_epi16(T36_0C, T36_1C);
    T36D = _mm_loadu_si128((__m128i *)(p_src + 36 * i_src + 48));
    T36_0D = _mm_unpacklo_epi8(T36D, zero);
    T36_1D = _mm_unpackhi_epi8(T36D, zero);
    T36D = _mm_add_epi16(T36_0D, T36_1D);
    T36 = _mm_add_epi16(T36A, T36B);
    T36 = _mm_add_epi16(T36, T36C);
    T36 = _mm_add_epi16(T36, T36D);

    T37A = _mm_loadu_si128((__m128i *)(p_src + 37 * i_src));
    T37_0A = _mm_unpacklo_epi8(T37A, zero);
    T37_1A = _mm_unpackhi_epi8(T37A, zero);
    T37A = _mm_add_epi16(T37_0A, T37_1A);
    T37B = _mm_loadu_si128((__m128i *)(p_src + 37 * i_src + 16));
    T37_0B = _mm_unpacklo_epi8(T37B, zero);
    T37_1B = _mm_unpackhi_epi8(T37B, zero);
    T37B = _mm_add_epi16(T37_0B, T37_1B);
    T37C = _mm_loadu_si128((__m128i *)(p_src + 37 * i_src + 32));
    T37_0C = _mm_unpacklo_epi8(T37C, zero);
    T37_1C = _mm_unpackhi_epi8(T37C, zero);
    T37C = _mm_add_epi16(T37_0C, T37_1C);
    T37D = _mm_loadu_si128((__m128i *)(p_src + 37 * i_src + 48));
    T37_0D = _mm_unpacklo_epi8(T37D, zero);
    T37_1D = _mm_unpackhi_epi8(T37D, zero);
    T37D = _mm_add_epi16(T37_0D, T37_1D);
    T37 = _mm_add_epi16(T37A, T37B);
    T37 = _mm_add_epi16(T37, T37C);
    T37 = _mm_add_epi16(T37, T37D);

    T38A = _mm_loadu_si128((__m128i *)(p_src + 38 * i_src));
    T38_0A = _mm_unpacklo_epi8(T38A, zero);
    T38_1A = _mm_unpackhi_epi8(T38A, zero);
    T38A = _mm_add_epi16(T38_0A, T38_1A);
    T38B = _mm_loadu_si128((__m128i *)(p_src + 38 * i_src + 16));
    T38_0B = _mm_unpacklo_epi8(T38B, zero);
    T38_1B = _mm_unpackhi_epi8(T38B, zero);
    T38B = _mm_add_epi16(T38_0B, T38_1B);
    T38C = _mm_loadu_si128((__m128i *)(p_src + 38 * i_src + 32));
    T38_0C = _mm_unpacklo_epi8(T38C, zero);
    T38_1C = _mm_unpackhi_epi8(T38C, zero);
    T38C = _mm_add_epi16(T38_0C, T38_1C);
    T38D = _mm_loadu_si128((__m128i *)(p_src + 38 * i_src + 48));
    T38_0D = _mm_unpacklo_epi8(T38D, zero);
    T38_1D = _mm_unpackhi_epi8(T38D, zero);
    T38D = _mm_add_epi16(T38_0D, T38_1D);
    T38 = _mm_add_epi16(T38A, T38B);
    T38 = _mm_add_epi16(T38, T38C);
    T38 = _mm_add_epi16(T38, T38D);

    T39A = _mm_loadu_si128((__m128i *)(p_src + 39 * i_src));
    T39_0A = _mm_unpacklo_epi8(T39A, zero);
    T39_1A = _mm_unpackhi_epi8(T39A, zero);
    T39A = _mm_add_epi16(T39_0A, T39_1A);
    T39B = _mm_loadu_si128((__m128i *)(p_src + 39 * i_src + 16));
    T39_0B = _mm_unpacklo_epi8(T39B, zero);
    T39_1B = _mm_unpackhi_epi8(T39B, zero);
    T39B = _mm_add_epi16(T39_0B, T39_1B);
    T39C = _mm_loadu_si128((__m128i *)(p_src + 39 * i_src + 32));
    T39_0C = _mm_unpacklo_epi8(T39C, zero);
    T39_1C = _mm_unpackhi_epi8(T39C, zero);
    T39C = _mm_add_epi16(T39_0C, T39_1C);
    T39D = _mm_loadu_si128((__m128i *)(p_src + 39 * i_src + 48));
    T39_0D = _mm_unpacklo_epi8(T39D, zero);
    T39_1D = _mm_unpackhi_epi8(T39D, zero);
    T39D = _mm_add_epi16(T39_0D, T39_1D);
    T39 = _mm_add_epi16(T39A, T39B);
    T39 = _mm_add_epi16(T39, T39C);
    T39 = _mm_add_epi16(T39, T39D);

    T40A = _mm_loadu_si128((__m128i *)(p_src + 40 * i_src));
    T40_0A = _mm_unpacklo_epi8(T40A, zero);
    T40_1A = _mm_unpackhi_epi8(T40A, zero);
    T40A = _mm_add_epi16(T40_0A, T40_1A);
    T40B = _mm_loadu_si128((__m128i *)(p_src + 40 * i_src + 16));
    T40_0B = _mm_unpacklo_epi8(T40B, zero);
    T40_1B = _mm_unpackhi_epi8(T40B, zero);
    T40B = _mm_add_epi16(T40_0B, T40_1B);
    T40C = _mm_loadu_si128((__m128i *)(p_src + 40 * i_src + 32));
    T40_0C = _mm_unpacklo_epi8(T40C, zero);
    T40_1C = _mm_unpackhi_epi8(T40C, zero);
    T40C = _mm_add_epi16(T40_0C, T40_1C);
    T40D = _mm_loadu_si128((__m128i *)(p_src + 40 * i_src + 48));
    T40_0D = _mm_unpacklo_epi8(T40D, zero);
    T40_1D = _mm_unpackhi_epi8(T40D, zero);
    T40D = _mm_add_epi16(T40_0D, T40_1D);
    T40 = _mm_add_epi16(T40A, T40B);
    T40 = _mm_add_epi16(T40, T40C);
    T40 = _mm_add_epi16(T40, T40D);

    T41A = _mm_loadu_si128((__m128i *)(p_src + 41 * i_src));
    T41_0A = _mm_unpacklo_epi8(T41A, zero);
    T41_1A = _mm_unpackhi_epi8(T41A, zero);
    T41A = _mm_add_epi16(T41_0A, T41_1A);
    T41B = _mm_loadu_si128((__m128i *)(p_src + 41 * i_src + 16));
    T41_0B = _mm_unpacklo_epi8(T41B, zero);
    T41_1B = _mm_unpackhi_epi8(T41B, zero);
    T41B = _mm_add_epi16(T41_0B, T41_1B);
    T41C = _mm_loadu_si128((__m128i *)(p_src + 41 * i_src + 32));
    T41_0C = _mm_unpacklo_epi8(T41C, zero);
    T41_1C = _mm_unpackhi_epi8(T41C, zero);
    T41C = _mm_add_epi16(T41_0C, T41_1C);
    T41D = _mm_loadu_si128((__m128i *)(p_src + 41 * i_src + 48));
    T41_0D = _mm_unpacklo_epi8(T41D, zero);
    T41_1D = _mm_unpackhi_epi8(T41D, zero);
    T41D = _mm_add_epi16(T41_0D, T41_1D);
    T41 = _mm_add_epi16(T41A, T41B);
    T41 = _mm_add_epi16(T41, T41C);
    T41 = _mm_add_epi16(T41, T41D);

    T42A = _mm_loadu_si128((__m128i *)(p_src + 42 * i_src));
    T42_0A = _mm_unpacklo_epi8(T42A, zero);
    T42_1A = _mm_unpackhi_epi8(T42A, zero);
    T42A = _mm_add_epi16(T42_0A, T42_1A);
    T42B = _mm_loadu_si128((__m128i *)(p_src + 42 * i_src + 16));
    T42_0B = _mm_unpacklo_epi8(T42B, zero);
    T42_1B = _mm_unpackhi_epi8(T42B, zero);
    T42B = _mm_add_epi16(T42_0B, T42_1B);
    T42C = _mm_loadu_si128((__m128i *)(p_src + 42 * i_src + 32));
    T42_0C = _mm_unpacklo_epi8(T42C, zero);
    T42_1C = _mm_unpackhi_epi8(T42C, zero);
    T42C = _mm_add_epi16(T42_0C, T42_1C);
    T42D = _mm_loadu_si128((__m128i *)(p_src + 42 * i_src + 48));
    T42_0D = _mm_unpacklo_epi8(T42D, zero);
    T42_1D = _mm_unpackhi_epi8(T42D, zero);
    T42D = _mm_add_epi16(T42_0D, T42_1D);
    T42 = _mm_add_epi16(T42A, T42B);
    T42 = _mm_add_epi16(T42, T42C);
    T42 = _mm_add_epi16(T42, T42D);

    T43A = _mm_loadu_si128((__m128i *)(p_src + 43 * i_src));
    T43_0A = _mm_unpacklo_epi8(T43A, zero);
    T43_1A = _mm_unpackhi_epi8(T43A, zero);
    T43A = _mm_add_epi16(T43_0A, T43_1A);
    T43B = _mm_loadu_si128((__m128i *)(p_src + 43 * i_src + 16));
    T43_0B = _mm_unpacklo_epi8(T43B, zero);
    T43_1B = _mm_unpackhi_epi8(T43B, zero);
    T43B = _mm_add_epi16(T43_0B, T43_1B);
    T43C = _mm_loadu_si128((__m128i *)(p_src + 43 * i_src + 32));
    T43_0C = _mm_unpacklo_epi8(T43C, zero);
    T43_1C = _mm_unpackhi_epi8(T43C, zero);
    T43C = _mm_add_epi16(T43_0C, T43_1C);
    T43D = _mm_loadu_si128((__m128i *)(p_src + 43 * i_src + 48));
    T43_0D = _mm_unpacklo_epi8(T43D, zero);
    T43_1D = _mm_unpackhi_epi8(T43D, zero);
    T43D = _mm_add_epi16(T43_0D, T43_1D);
    T43 = _mm_add_epi16(T43A, T43B);
    T43 = _mm_add_epi16(T43, T43C);
    T43 = _mm_add_epi16(T43, T43D);

    T44A = _mm_loadu_si128((__m128i *)(p_src + 44 * i_src));
    T44_0A = _mm_unpacklo_epi8(T44A, zero);
    T44_1A = _mm_unpackhi_epi8(T44A, zero);
    T44A = _mm_add_epi16(T44_0A, T44_1A);
    T44B = _mm_loadu_si128((__m128i *)(p_src + 44 * i_src + 16));
    T44_0B = _mm_unpacklo_epi8(T44B, zero);
    T44_1B = _mm_unpackhi_epi8(T44B, zero);
    T44B = _mm_add_epi16(T44_0B, T44_1B);
    T44C = _mm_loadu_si128((__m128i *)(p_src + 44 * i_src + 32));
    T44_0C = _mm_unpacklo_epi8(T44C, zero);
    T44_1C = _mm_unpackhi_epi8(T44C, zero);
    T44C = _mm_add_epi16(T44_0C, T44_1C);
    T44D = _mm_loadu_si128((__m128i *)(p_src + 44 * i_src + 48));
    T44_0D = _mm_unpacklo_epi8(T44D, zero);
    T44_1D = _mm_unpackhi_epi8(T44D, zero);
    T44D = _mm_add_epi16(T44_0D, T44_1D);
    T44 = _mm_add_epi16(T44A, T44B);
    T44 = _mm_add_epi16(T44, T44C);
    T44 = _mm_add_epi16(T44, T44D);

    T45A = _mm_loadu_si128((__m128i *)(p_src + 45 * i_src));
    T45_0A = _mm_unpacklo_epi8(T45A, zero);
    T45_1A = _mm_unpackhi_epi8(T45A, zero);
    T45A = _mm_add_epi16(T45_0A, T45_1A);
    T45B = _mm_loadu_si128((__m128i *)(p_src + 45 * i_src + 16));
    T45_0B = _mm_unpacklo_epi8(T45B, zero);
    T45_1B = _mm_unpackhi_epi8(T45B, zero);
    T45B = _mm_add_epi16(T45_0B, T45_1B);
    T45C = _mm_loadu_si128((__m128i *)(p_src + 45 * i_src + 32));
    T45_0C = _mm_unpacklo_epi8(T45C, zero);
    T45_1C = _mm_unpackhi_epi8(T45C, zero);
    T45C = _mm_add_epi16(T45_0C, T45_1C);
    T45D = _mm_loadu_si128((__m128i *)(p_src + 45 * i_src + 48));
    T45_0D = _mm_unpacklo_epi8(T45D, zero);
    T45_1D = _mm_unpackhi_epi8(T45D, zero);
    T45D = _mm_add_epi16(T45_0D, T45_1D);
    T45 = _mm_add_epi16(T45A, T45B);
    T45 = _mm_add_epi16(T45, T45C);
    T45 = _mm_add_epi16(T45, T45D);

    T46A = _mm_loadu_si128((__m128i *)(p_src + 46 * i_src));
    T46_0A = _mm_unpacklo_epi8(T46A, zero);
    T46_1A = _mm_unpackhi_epi8(T46A, zero);
    T46A = _mm_add_epi16(T46_0A, T46_1A);
    T46B = _mm_loadu_si128((__m128i *)(p_src + 46 * i_src + 16));
    T46_0B = _mm_unpacklo_epi8(T46B, zero);
    T46_1B = _mm_unpackhi_epi8(T46B, zero);
    T46B = _mm_add_epi16(T46_0B, T46_1B);
    T46C = _mm_loadu_si128((__m128i *)(p_src + 46 * i_src + 32));
    T46_0C = _mm_unpacklo_epi8(T46C, zero);
    T46_1C = _mm_unpackhi_epi8(T46C, zero);
    T46C = _mm_add_epi16(T46_0C, T46_1C);
    T46D = _mm_loadu_si128((__m128i *)(p_src + 46 * i_src + 48));
    T46_0D = _mm_unpacklo_epi8(T46D, zero);
    T46_1D = _mm_unpackhi_epi8(T46D, zero);
    T46D = _mm_add_epi16(T46_0D, T46_1D);
    T46 = _mm_add_epi16(T46A, T46B);
    T46 = _mm_add_epi16(T46, T46C);
    T46 = _mm_add_epi16(T46, T46D);

    T47A = _mm_loadu_si128((__m128i *)(p_src + 47 * i_src));
    T47_0A = _mm_unpacklo_epi8(T47A, zero);
    T47_1A = _mm_unpackhi_epi8(T47A, zero);
    T47A = _mm_add_epi16(T47_0A, T47_1A);
    T47B = _mm_loadu_si128((__m128i *)(p_src + 47 * i_src + 16));
    T47_0B = _mm_unpacklo_epi8(T47B, zero);
    T47_1B = _mm_unpackhi_epi8(T47B, zero);
    T47B = _mm_add_epi16(T47_0B, T47_1B);
    T47C = _mm_loadu_si128((__m128i *)(p_src + 47 * i_src + 32));
    T47_0C = _mm_unpacklo_epi8(T47C, zero);
    T47_1C = _mm_unpackhi_epi8(T47C, zero);
    T47C = _mm_add_epi16(T47_0C, T47_1C);
    T47D = _mm_loadu_si128((__m128i *)(p_src + 47 * i_src + 48));
    T47_0D = _mm_unpacklo_epi8(T47D, zero);
    T47_1D = _mm_unpackhi_epi8(T47D, zero);
    T47D = _mm_add_epi16(T47_0D, T47_1D);
    T47 = _mm_add_epi16(T47A, T47B);
    T47 = _mm_add_epi16(T47, T47C);
    T47 = _mm_add_epi16(T47, T47D);

    T48A = _mm_loadu_si128((__m128i *)(p_src + 48 * i_src));
    T48_0A = _mm_unpacklo_epi8(T48A, zero);
    T48_1A = _mm_unpackhi_epi8(T48A, zero);
    T48A = _mm_add_epi16(T48_0A, T48_1A);
    T48B = _mm_loadu_si128((__m128i *)(p_src + 48 * i_src + 16));
    T48_0B = _mm_unpacklo_epi8(T48B, zero);
    T48_1B = _mm_unpackhi_epi8(T48B, zero);
    T48B = _mm_add_epi16(T48_0B, T48_1B);
    T48C = _mm_loadu_si128((__m128i *)(p_src + 48 * i_src + 32));
    T48_0C = _mm_unpacklo_epi8(T48C, zero);
    T48_1C = _mm_unpackhi_epi8(T48C, zero);
    T48C = _mm_add_epi16(T48_0C, T48_1C);
    T48D = _mm_loadu_si128((__m128i *)(p_src + 48 * i_src + 48));
    T48_0D = _mm_unpacklo_epi8(T48D, zero);
    T48_1D = _mm_unpackhi_epi8(T48D, zero);
    T48D = _mm_add_epi16(T48_0D, T48_1D);
    T48 = _mm_add_epi16(T48A, T48B);
    T48 = _mm_add_epi16(T48, T48C);
    T48 = _mm_add_epi16(T48, T48D);

    T49A = _mm_loadu_si128((__m128i *)(p_src + 49 * i_src));
    T49_0A = _mm_unpacklo_epi8(T49A, zero);
    T49_1A = _mm_unpackhi_epi8(T49A, zero);
    T49A = _mm_add_epi16(T49_0A, T49_1A);
    T49B = _mm_loadu_si128((__m128i *)(p_src + 49 * i_src + 16));
    T49_0B = _mm_unpacklo_epi8(T49B, zero);
    T49_1B = _mm_unpackhi_epi8(T49B, zero);
    T49B = _mm_add_epi16(T49_0B, T49_1B);
    T49C = _mm_loadu_si128((__m128i *)(p_src + 49 * i_src + 32));
    T49_0C = _mm_unpacklo_epi8(T49C, zero);
    T49_1C = _mm_unpackhi_epi8(T49C, zero);
    T49C = _mm_add_epi16(T49_0C, T49_1C);
    T49D = _mm_loadu_si128((__m128i *)(p_src + 49 * i_src + 48));
    T49_0D = _mm_unpacklo_epi8(T49D, zero);
    T49_1D = _mm_unpackhi_epi8(T49D, zero);
    T49D = _mm_add_epi16(T49_0D, T49_1D);
    T49 = _mm_add_epi16(T49A, T49B);
    T49 = _mm_add_epi16(T49, T49C);
    T49 = _mm_add_epi16(T49, T49D);

    T50A = _mm_loadu_si128((__m128i *)(p_src + 50 * i_src));
    T50_0A = _mm_unpacklo_epi8(T50A, zero);
    T50_1A = _mm_unpackhi_epi8(T50A, zero);
    T50A = _mm_add_epi16(T50_0A, T50_1A);
    T50B = _mm_loadu_si128((__m128i *)(p_src + 50 * i_src + 16));
    T50_0B = _mm_unpacklo_epi8(T50B, zero);
    T50_1B = _mm_unpackhi_epi8(T50B, zero);
    T50B = _mm_add_epi16(T50_0B, T50_1B);
    T50C = _mm_loadu_si128((__m128i *)(p_src + 50 * i_src + 32));
    T50_0C = _mm_unpacklo_epi8(T50C, zero);
    T50_1C = _mm_unpackhi_epi8(T50C, zero);
    T50C = _mm_add_epi16(T50_0C, T50_1C);
    T50D = _mm_loadu_si128((__m128i *)(p_src + 50 * i_src + 48));
    T50_0D = _mm_unpacklo_epi8(T50D, zero);
    T50_1D = _mm_unpackhi_epi8(T50D, zero);
    T50D = _mm_add_epi16(T50_0D, T50_1D);
    T50 = _mm_add_epi16(T50A, T50B);
    T50 = _mm_add_epi16(T50, T50C);
    T50 = _mm_add_epi16(T50, T50D);

    T51A = _mm_loadu_si128((__m128i *)(p_src + 51 * i_src));
    T51_0A = _mm_unpacklo_epi8(T51A, zero);
    T51_1A = _mm_unpackhi_epi8(T51A, zero);
    T51A = _mm_add_epi16(T51_0A, T51_1A);
    T51B = _mm_loadu_si128((__m128i *)(p_src + 51 * i_src + 16));
    T51_0B = _mm_unpacklo_epi8(T51B, zero);
    T51_1B = _mm_unpackhi_epi8(T51B, zero);
    T51B = _mm_add_epi16(T51_0B, T51_1B);
    T51C = _mm_loadu_si128((__m128i *)(p_src + 51 * i_src + 32));
    T51_0C = _mm_unpacklo_epi8(T51C, zero);
    T51_1C = _mm_unpackhi_epi8(T51C, zero);
    T51C = _mm_add_epi16(T51_0C, T51_1C);
    T51D = _mm_loadu_si128((__m128i *)(p_src + 51 * i_src + 48));
    T51_0D = _mm_unpacklo_epi8(T51D, zero);
    T51_1D = _mm_unpackhi_epi8(T51D, zero);
    T51D = _mm_add_epi16(T51_0D, T51_1D);
    T51 = _mm_add_epi16(T51A, T51B);
    T51 = _mm_add_epi16(T51, T51C);
    T51 = _mm_add_epi16(T51, T51D);

    T52A = _mm_loadu_si128((__m128i *)(p_src + 52 * i_src));
    T52_0A = _mm_unpacklo_epi8(T52A, zero);
    T52_1A = _mm_unpackhi_epi8(T52A, zero);
    T52A = _mm_add_epi16(T52_0A, T52_1A);
    T52B = _mm_loadu_si128((__m128i *)(p_src + 52 * i_src + 16));
    T52_0B = _mm_unpacklo_epi8(T52B, zero);
    T52_1B = _mm_unpackhi_epi8(T52B, zero);
    T52B = _mm_add_epi16(T52_0B, T52_1B);
    T52C = _mm_loadu_si128((__m128i *)(p_src + 52 * i_src + 32));
    T52_0C = _mm_unpacklo_epi8(T52C, zero);
    T52_1C = _mm_unpackhi_epi8(T52C, zero);
    T52C = _mm_add_epi16(T52_0C, T52_1C);
    T52D = _mm_loadu_si128((__m128i *)(p_src + 52 * i_src + 48));
    T52_0D = _mm_unpacklo_epi8(T52D, zero);
    T52_1D = _mm_unpackhi_epi8(T52D, zero);
    T52D = _mm_add_epi16(T52_0D, T52_1D);
    T52 = _mm_add_epi16(T52A, T52B);
    T52 = _mm_add_epi16(T52, T52C);
    T52 = _mm_add_epi16(T52, T52D);

    T53A = _mm_loadu_si128((__m128i *)(p_src + 53 * i_src));
    T53_0A = _mm_unpacklo_epi8(T53A, zero);
    T53_1A = _mm_unpackhi_epi8(T53A, zero);
    T53A = _mm_add_epi16(T53_0A, T53_1A);
    T53B = _mm_loadu_si128((__m128i *)(p_src + 53 * i_src + 16));
    T53_0B = _mm_unpacklo_epi8(T53B, zero);
    T53_1B = _mm_unpackhi_epi8(T53B, zero);
    T53B = _mm_add_epi16(T53_0B, T53_1B);
    T53C = _mm_loadu_si128((__m128i *)(p_src + 53 * i_src + 32));
    T53_0C = _mm_unpacklo_epi8(T53C, zero);
    T53_1C = _mm_unpackhi_epi8(T53C, zero);
    T53C = _mm_add_epi16(T53_0C, T53_1C);
    T53D = _mm_loadu_si128((__m128i *)(p_src + 53 * i_src + 48));
    T53_0D = _mm_unpacklo_epi8(T53D, zero);
    T53_1D = _mm_unpackhi_epi8(T53D, zero);
    T53D = _mm_add_epi16(T53_0D, T53_1D);
    T53 = _mm_add_epi16(T53A, T53B);
    T53 = _mm_add_epi16(T53, T53C);
    T53 = _mm_add_epi16(T53, T53D);

    T54A = _mm_loadu_si128((__m128i *)(p_src + 54 * i_src));
    T54_0A = _mm_unpacklo_epi8(T54A, zero);
    T54_1A = _mm_unpackhi_epi8(T54A, zero);
    T54A = _mm_add_epi16(T54_0A, T54_1A);
    T54B = _mm_loadu_si128((__m128i *)(p_src + 54 * i_src + 16));
    T54_0B = _mm_unpacklo_epi8(T54B, zero);
    T54_1B = _mm_unpackhi_epi8(T54B, zero);
    T54B = _mm_add_epi16(T54_0B, T54_1B);
    T54C = _mm_loadu_si128((__m128i *)(p_src + 54 * i_src + 32));
    T54_0C = _mm_unpacklo_epi8(T54C, zero);
    T54_1C = _mm_unpackhi_epi8(T54C, zero);
    T54C = _mm_add_epi16(T54_0C, T54_1C);
    T54D = _mm_loadu_si128((__m128i *)(p_src + 54 * i_src + 48));
    T54_0D = _mm_unpacklo_epi8(T54D, zero);
    T54_1D = _mm_unpackhi_epi8(T54D, zero);
    T54D = _mm_add_epi16(T54_0D, T54_1D);
    T54 = _mm_add_epi16(T54A, T54B);
    T54 = _mm_add_epi16(T54, T54C);
    T54 = _mm_add_epi16(T54, T54D);

    T55A = _mm_loadu_si128((__m128i *)(p_src + 55 * i_src));
    T55_0A = _mm_unpacklo_epi8(T55A, zero);
    T55_1A = _mm_unpackhi_epi8(T55A, zero);
    T55A = _mm_add_epi16(T55_0A, T55_1A);
    T55B = _mm_loadu_si128((__m128i *)(p_src + 55 * i_src + 16));
    T55_0B = _mm_unpacklo_epi8(T55B, zero);
    T55_1B = _mm_unpackhi_epi8(T55B, zero);
    T55B = _mm_add_epi16(T55_0B, T55_1B);
    T55C = _mm_loadu_si128((__m128i *)(p_src + 55 * i_src + 32));
    T55_0C = _mm_unpacklo_epi8(T55C, zero);
    T55_1C = _mm_unpackhi_epi8(T55C, zero);
    T55C = _mm_add_epi16(T55_0C, T55_1C);
    T55D = _mm_loadu_si128((__m128i *)(p_src + 55 * i_src + 48));
    T55_0D = _mm_unpacklo_epi8(T55D, zero);
    T55_1D = _mm_unpackhi_epi8(T55D, zero);
    T55D = _mm_add_epi16(T55_0D, T55_1D);
    T55 = _mm_add_epi16(T55A, T55B);
    T55 = _mm_add_epi16(T55, T55C);
    T55 = _mm_add_epi16(T55, T55D);

    T56A = _mm_loadu_si128((__m128i *)(p_src + 56 * i_src));
    T56_0A = _mm_unpacklo_epi8(T56A, zero);
    T56_1A = _mm_unpackhi_epi8(T56A, zero);
    T56A = _mm_add_epi16(T56_0A, T56_1A);
    T56B = _mm_loadu_si128((__m128i *)(p_src + 56 * i_src + 16));
    T56_0B = _mm_unpacklo_epi8(T56B, zero);
    T56_1B = _mm_unpackhi_epi8(T56B, zero);
    T56B = _mm_add_epi16(T56_0B, T56_1B);
    T56C = _mm_loadu_si128((__m128i *)(p_src + 56 * i_src + 32));
    T56_0C = _mm_unpacklo_epi8(T56C, zero);
    T56_1C = _mm_unpackhi_epi8(T56C, zero);
    T56C = _mm_add_epi16(T56_0C, T56_1C);
    T56D = _mm_loadu_si128((__m128i *)(p_src + 56 * i_src + 48));
    T56_0D = _mm_unpacklo_epi8(T56D, zero);
    T56_1D = _mm_unpackhi_epi8(T56D, zero);
    T56D = _mm_add_epi16(T56_0D, T56_1D);
    T56 = _mm_add_epi16(T56A, T56B);
    T56 = _mm_add_epi16(T56, T56C);
    T56 = _mm_add_epi16(T56, T56D);

    T57A = _mm_loadu_si128((__m128i *)(p_src + 57 * i_src));
    T57_0A = _mm_unpacklo_epi8(T57A, zero);
    T57_1A = _mm_unpackhi_epi8(T57A, zero);
    T57A = _mm_add_epi16(T57_0A, T57_1A);
    T57B = _mm_loadu_si128((__m128i *)(p_src + 57 * i_src + 16));
    T57_0B = _mm_unpacklo_epi8(T57B, zero);
    T57_1B = _mm_unpackhi_epi8(T57B, zero);
    T57B = _mm_add_epi16(T57_0B, T57_1B);
    T57C = _mm_loadu_si128((__m128i *)(p_src + 57 * i_src + 32));
    T57_0C = _mm_unpacklo_epi8(T57C, zero);
    T57_1C = _mm_unpackhi_epi8(T57C, zero);
    T57C = _mm_add_epi16(T57_0C, T57_1C);
    T57D = _mm_loadu_si128((__m128i *)(p_src + 57 * i_src + 48));
    T57_0D = _mm_unpacklo_epi8(T57D, zero);
    T57_1D = _mm_unpackhi_epi8(T57D, zero);
    T57D = _mm_add_epi16(T57_0D, T57_1D);
    T57 = _mm_add_epi16(T57A, T57B);
    T57 = _mm_add_epi16(T57, T57C);
    T57 = _mm_add_epi16(T57, T57D);

    T58A = _mm_loadu_si128((__m128i *)(p_src + 58 * i_src));
    T58_0A = _mm_unpacklo_epi8(T58A, zero);
    T58_1A = _mm_unpackhi_epi8(T58A, zero);
    T58A = _mm_add_epi16(T58_0A, T58_1A);
    T58B = _mm_loadu_si128((__m128i *)(p_src + 58 * i_src + 16));
    T58_0B = _mm_unpacklo_epi8(T58B, zero);
    T58_1B = _mm_unpackhi_epi8(T58B, zero);
    T58B = _mm_add_epi16(T58_0B, T58_1B);
    T58C = _mm_loadu_si128((__m128i *)(p_src + 58 * i_src + 32));
    T58_0C = _mm_unpacklo_epi8(T58C, zero);
    T58_1C = _mm_unpackhi_epi8(T58C, zero);
    T58C = _mm_add_epi16(T58_0C, T58_1C);
    T58D = _mm_loadu_si128((__m128i *)(p_src + 58 * i_src + 48));
    T58_0D = _mm_unpacklo_epi8(T58D, zero);
    T58_1D = _mm_unpackhi_epi8(T58D, zero);
    T58D = _mm_add_epi16(T58_0D, T58_1D);
    T58 = _mm_add_epi16(T58A, T58B);
    T58 = _mm_add_epi16(T58, T58C);
    T58 = _mm_add_epi16(T58, T58D);

    T59A = _mm_loadu_si128((__m128i *)(p_src + 59 * i_src));
    T59_0A = _mm_unpacklo_epi8(T59A, zero);
    T59_1A = _mm_unpackhi_epi8(T59A, zero);
    T59A = _mm_add_epi16(T59_0A, T59_1A);
    T59B = _mm_loadu_si128((__m128i *)(p_src + 59 * i_src + 16));
    T59_0B = _mm_unpacklo_epi8(T59B, zero);
    T59_1B = _mm_unpackhi_epi8(T59B, zero);
    T59B = _mm_add_epi16(T59_0B, T59_1B);
    T59C = _mm_loadu_si128((__m128i *)(p_src + 59 * i_src + 32));
    T59_0C = _mm_unpacklo_epi8(T59C, zero);
    T59_1C = _mm_unpackhi_epi8(T59C, zero);
    T59C = _mm_add_epi16(T59_0C, T59_1C);
    T59D = _mm_loadu_si128((__m128i *)(p_src + 59 * i_src + 48));
    T59_0D = _mm_unpacklo_epi8(T59D, zero);
    T59_1D = _mm_unpackhi_epi8(T59D, zero);
    T59D = _mm_add_epi16(T59_0D, T59_1D);
    T59 = _mm_add_epi16(T59A, T59B);
    T59 = _mm_add_epi16(T59, T59C);
    T59 = _mm_add_epi16(T59, T59D);

    T60A = _mm_loadu_si128((__m128i *)(p_src + 60 * i_src));
    T60_0A = _mm_unpacklo_epi8(T60A, zero);
    T60_1A = _mm_unpackhi_epi8(T60A, zero);
    T60A = _mm_add_epi16(T60_0A, T60_1A);
    T60B = _mm_loadu_si128((__m128i *)(p_src + 60 * i_src + 16));
    T60_0B = _mm_unpacklo_epi8(T60B, zero);
    T60_1B = _mm_unpackhi_epi8(T60B, zero);
    T60B = _mm_add_epi16(T60_0B, T60_1B);
    T60C = _mm_loadu_si128((__m128i *)(p_src + 60 * i_src + 32));
    T60_0C = _mm_unpacklo_epi8(T60C, zero);
    T60_1C = _mm_unpackhi_epi8(T60C, zero);
    T60C = _mm_add_epi16(T60_0C, T60_1C);
    T60D = _mm_loadu_si128((__m128i *)(p_src + 60 * i_src + 48));
    T60_0D = _mm_unpacklo_epi8(T60D, zero);
    T60_1D = _mm_unpackhi_epi8(T60D, zero);
    T60D = _mm_add_epi16(T60_0D, T60_1D);
    T60 = _mm_add_epi16(T60A, T60B);
    T60 = _mm_add_epi16(T60, T60C);
    T60 = _mm_add_epi16(T60, T60D);

    T61A = _mm_loadu_si128((__m128i *)(p_src + 61 * i_src));
    T61_0A = _mm_unpacklo_epi8(T61A, zero);
    T61_1A = _mm_unpackhi_epi8(T61A, zero);
    T61A = _mm_add_epi16(T61_0A, T61_1A);
    T61B = _mm_loadu_si128((__m128i *)(p_src + 61 * i_src + 16));
    T61_0B = _mm_unpacklo_epi8(T61B, zero);
    T61_1B = _mm_unpackhi_epi8(T61B, zero);
    T61B = _mm_add_epi16(T61_0B, T61_1B);
    T61C = _mm_loadu_si128((__m128i *)(p_src + 61 * i_src + 32));
    T61_0C = _mm_unpacklo_epi8(T61C, zero);
    T61_1C = _mm_unpackhi_epi8(T61C, zero);
    T61C = _mm_add_epi16(T61_0C, T61_1C);
    T61D = _mm_loadu_si128((__m128i *)(p_src + 61 * i_src + 48));
    T61_0D = _mm_unpacklo_epi8(T61D, zero);
    T61_1D = _mm_unpackhi_epi8(T61D, zero);
    T61D = _mm_add_epi16(T61_0D, T61_1D);
    T61 = _mm_add_epi16(T61A, T61B);
    T61 = _mm_add_epi16(T61, T61C);
    T61 = _mm_add_epi16(T61, T61D);

    T62A = _mm_loadu_si128((__m128i *)(p_src + 62 * i_src));
    T62_0A = _mm_unpacklo_epi8(T62A, zero);
    T62_1A = _mm_unpackhi_epi8(T62A, zero);
    T62A = _mm_add_epi16(T62_0A, T62_1A);
    T62B = _mm_loadu_si128((__m128i *)(p_src + 62 * i_src + 16));
    T62_0B = _mm_unpacklo_epi8(T62B, zero);
    T62_1B = _mm_unpackhi_epi8(T62B, zero);
    T62B = _mm_add_epi16(T62_0B, T62_1B);
    T62C = _mm_loadu_si128((__m128i *)(p_src + 62 * i_src + 32));
    T62_0C = _mm_unpacklo_epi8(T62C, zero);
    T62_1C = _mm_unpackhi_epi8(T62C, zero);
    T62C = _mm_add_epi16(T62_0C, T62_1C);
    T62D = _mm_loadu_si128((__m128i *)(p_src + 62 * i_src + 48));
    T62_0D = _mm_unpacklo_epi8(T62D, zero);
    T62_1D = _mm_unpackhi_epi8(T62D, zero);
    T62D = _mm_add_epi16(T62_0D, T62_1D);
    T62 = _mm_add_epi16(T62A, T62B);
    T62 = _mm_add_epi16(T62, T62C);
    T62 = _mm_add_epi16(T62, T62D);

    T63A = _mm_loadu_si128((__m128i *)(p_src + 63 * i_src));
    T63_0A = _mm_unpacklo_epi8(T63A, zero);
    T63_1A = _mm_unpackhi_epi8(T63A, zero);
    T63A = _mm_add_epi16(T63_0A, T63_1A);
    T63B = _mm_loadu_si128((__m128i *)(p_src + 63 * i_src + 16));
    T63_0B = _mm_unpacklo_epi8(T63B, zero);
    T63_1B = _mm_unpackhi_epi8(T63B, zero);
    T63B = _mm_add_epi16(T63_0B, T63_1B);
    T63C = _mm_loadu_si128((__m128i *)(p_src + 63 * i_src + 32));
    T63_0C = _mm_unpacklo_epi8(T63C, zero);
    T63_1C = _mm_unpackhi_epi8(T63C, zero);
    T63C = _mm_add_epi16(T63_0C, T63_1C);
    T63D = _mm_loadu_si128((__m128i *)(p_src + 63 * i_src + 48));
    T63_0D = _mm_unpacklo_epi8(T63D, zero);
    T63_1D = _mm_unpackhi_epi8(T63D, zero);
    T63D = _mm_add_epi16(T63_0D, T63_1D);
    T63 = _mm_add_epi16(T63A, T63B);
    T63 = _mm_add_epi16(T63, T63C);
    T63 = _mm_add_epi16(T63, T63D);

    S1 = _mm_add_epi16(T0, T1);
    S1 = _mm_add_epi16(S1, T2);
    S1 = _mm_add_epi16(S1, T3);
    S1 = _mm_add_epi16(S1, T4);
    S1 = _mm_add_epi16(S1, T5);
    S1 = _mm_add_epi16(S1, T6);
    S1 = _mm_add_epi16(S1, T7);
    S1 = _mm_add_epi16(S1, T8);
    S1 = _mm_add_epi16(S1, T9);
    S1 = _mm_add_epi16(S1, T10);
    S1 = _mm_add_epi16(S1, T11);
    S1 = _mm_add_epi16(S1, T12);
    S1 = _mm_add_epi16(S1, T13);
    S1 = _mm_add_epi16(S1, T14);
    S1 = _mm_add_epi16(S1, T15);

    S2 = _mm_add_epi16(T16, T17);
    S2 = _mm_add_epi16(S2, T18);
    S2 = _mm_add_epi16(S2, T19);
    S2 = _mm_add_epi16(S2, T20);
    S2 = _mm_add_epi16(S2, T21);
    S2 = _mm_add_epi16(S2, T22);
    S2 = _mm_add_epi16(S2, T23);
    S2 = _mm_add_epi16(S2, T24);
    S2 = _mm_add_epi16(S2, T25);
    S2 = _mm_add_epi16(S2, T26);
    S2 = _mm_add_epi16(S2, T27);
    S2 = _mm_add_epi16(S2, T28);
    S2 = _mm_add_epi16(S2, T29);
    S2 = _mm_add_epi16(S2, T30);
    S2 = _mm_add_epi16(S2, T31);

    S3 = _mm_add_epi16(T32, T33);
    S3 = _mm_add_epi16(S3, T34);
    S3 = _mm_add_epi16(S3, T35);
    S3 = _mm_add_epi16(S3, T36);
    S3 = _mm_add_epi16(S3, T37);
    S3 = _mm_add_epi16(S3, T38);
    S3 = _mm_add_epi16(S3, T39);
    S3 = _mm_add_epi16(S3, T40);
    S3 = _mm_add_epi16(S3, T41);
    S3 = _mm_add_epi16(S3, T42);
    S3 = _mm_add_epi16(S3, T43);
    S3 = _mm_add_epi16(S3, T44);
    S3 = _mm_add_epi16(S3, T45);
    S3 = _mm_add_epi16(S3, T46);
    S3 = _mm_add_epi16(S3, T47);

    S = _mm_add_epi16(T48, T49);
    S = _mm_add_epi16(S, T50);
    S = _mm_add_epi16(S, T51);
    S = _mm_add_epi16(S, T52);
    S = _mm_add_epi16(S, T53);
    S = _mm_add_epi16(S, T54);
    S = _mm_add_epi16(S, T55);
    S = _mm_add_epi16(S, T56);
    S = _mm_add_epi16(S, T57);
    S = _mm_add_epi16(S, T58);
    S = _mm_add_epi16(S, T59);
    S = _mm_add_epi16(S, T60);
    S = _mm_add_epi16(S, T61);
    S = _mm_add_epi16(S, T62);
    S = _mm_add_epi16(S, T63);

    sum1 = M128_U16(S1, 0) + M128_U16(S1, 1) + M128_U16(S1, 2) + M128_U16(S1, 3) + M128_U16(S1, 4) + M128_U16(S1, 5) + M128_U16(S1, 6) + M128_U16(S1, 7);
    sum2 = M128_U16(S2, 0) + M128_U16(S2, 1) + M128_U16(S2, 2) + M128_U16(S2, 3) + M128_U16(S2, 4) + M128_U16(S2, 5) + M128_U16(S2, 6) + M128_U16(S2, 7);
    sum3 = M128_U16(S3, 0) + M128_U16(S3, 1) + M128_U16(S3, 2) + M128_U16(S3, 3) + M128_U16(S3, 4) + M128_U16(S3, 5) + M128_U16(S3, 6) + M128_U16(S3, 7);
    sum = M128_U16(S, 0) + M128_U16(S, 1) + M128_U16(S, 2) + M128_U16(S, 3) + M128_U16(S, 4) + M128_U16(S, 5) + M128_U16(S, 6) + M128_U16(S, 7);
    sum = sum + sum1 + sum2 + sum3;
    f_avg = (sum + (num_pix >> 1)) / num_pix;

    avg = _mm_set1_epi16((short)f_avg);

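    /* Calculate the MAD: accumulate the absolute difference between every
     * pixel and the block average (f_avg).
     * Scalar reference:
     *
     *   for (int y = 0; y < cu_size; ++y) {
     *       for (int x = 0; x < cu_size; ++x) {
     *           int f_pxl = p_src[x];
     *           mad += AVS2_ABS(f_pxl - f_avg);
     *       }
     *       p_src += i_src;
     *   }
     */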
    T0_0A = _mm_sub_epi16(T0_0A, avg);
    T0_1A = _mm_sub_epi16(T0_1A, avg);
    T0_0B = _mm_sub_epi16(T0_0B, avg);
    T0_1B = _mm_sub_epi16(T0_1B, avg);
    T0_0C = _mm_sub_epi16(T0_0C, avg);
    T0_1C = _mm_sub_epi16(T0_1C, avg);
    T0_0D = _mm_sub_epi16(T0_0D, avg);
    T0_1D = _mm_sub_epi16(T0_1D, avg);
    T1_0A = _mm_sub_epi16(T1_0A, avg);
    T1_1A = _mm_sub_epi16(T1_1A, avg);
    T1_0B = _mm_sub_epi16(T1_0B, avg);
    T1_1B = _mm_sub_epi16(T1_1B, avg);
    T1_0C = _mm_sub_epi16(T1_0C, avg);
    T1_1C = _mm_sub_epi16(T1_1C, avg);
    T1_0D = _mm_sub_epi16(T1_0D, avg);
    T1_1D = _mm_sub_epi16(T1_1D, avg);
    T2_0A = _mm_sub_epi16(T2_0A, avg);
    T2_1A = _mm_sub_epi16(T2_1A, avg);
    T2_0B = _mm_sub_epi16(T2_0B, avg);
    T2_1B = _mm_sub_epi16(T2_1B, avg);
    T2_0C = _mm_sub_epi16(T2_0C, avg);
    T2_1C = _mm_sub_epi16(T2_1C, avg);
    T2_0D = _mm_sub_epi16(T2_0D, avg);
    T2_1D = _mm_sub_epi16(T2_1D, avg);
    T3_0A = _mm_sub_epi16(T3_0A, avg);
    T3_1A = _mm_sub_epi16(T3_1A, avg);
    T3_0B = _mm_sub_epi16(T3_0B, avg);
    T3_1B = _mm_sub_epi16(T3_1B, avg);
    T3_0C = _mm_sub_epi16(T3_0C, avg);
    T3_1C = _mm_sub_epi16(T3_1C, avg);
    T3_0D = _mm_sub_epi16(T3_0D, avg);
    T3_1D = _mm_sub_epi16(T3_1D, avg);
    T4_0A = _mm_sub_epi16(T4_0A, avg);
    T4_1A = _mm_sub_epi16(T4_1A, avg);
    T4_0B = _mm_sub_epi16(T4_0B, avg);
    T4_1B = _mm_sub_epi16(T4_1B, avg);
    T4_0C = _mm_sub_epi16(T4_0C, avg);
    T4_1C = _mm_sub_epi16(T4_1C, avg);
    T4_0D = _mm_sub_epi16(T4_0D, avg);
    T4_1D = _mm_sub_epi16(T4_1D, avg);
    T5_0A = _mm_sub_epi16(T5_0A, avg);
    T5_1A = _mm_sub_epi16(T5_1A, avg);
    T5_0B = _mm_sub_epi16(T5_0B, avg);
    T5_1B = _mm_sub_epi16(T5_1B, avg);
    T5_0C = _mm_sub_epi16(T5_0C, avg);
    T5_1C = _mm_sub_epi16(T5_1C, avg);
    T5_0D = _mm_sub_epi16(T5_0D, avg);
    T5_1D = _mm_sub_epi16(T5_1D, avg);
    T6_0A = _mm_sub_epi16(T6_0A, avg);
    T6_1A = _mm_sub_epi16(T6_1A, avg);
    T6_0B = _mm_sub_epi16(T6_0B, avg);
    T6_1B = _mm_sub_epi16(T6_1B, avg);
    T6_0C = _mm_sub_epi16(T6_0C, avg);
    T6_1C = _mm_sub_epi16(T6_1C, avg);
    T6_0D = _mm_sub_epi16(T6_0D, avg);
    T6_1D = _mm_sub_epi16(T6_1D, avg);
    T7_0A = _mm_sub_epi16(T7_0A, avg);
    T7_1A = _mm_sub_epi16(T7_1A, avg);
    T7_0B = _mm_sub_epi16(T7_0B, avg);
    T7_1B = _mm_sub_epi16(T7_1B, avg);
    T7_0C = _mm_sub_epi16(T7_0C, avg);
    T7_1C = _mm_sub_epi16(T7_1C, avg);
    T7_0D = _mm_sub_epi16(T7_0D, avg);
    T7_1D = _mm_sub_epi16(T7_1D, avg);
    T8_0A = _mm_sub_epi16(T8_0A, avg);
    T8_1A = _mm_sub_epi16(T8_1A, avg);
    T8_0B = _mm_sub_epi16(T8_0B, avg);
    T8_1B = _mm_sub_epi16(T8_1B, avg);
    T8_0C = _mm_sub_epi16(T8_0C, avg);
    T8_1C = _mm_sub_epi16(T8_1C, avg);
    T8_0D = _mm_sub_epi16(T8_0D, avg);
    T8_1D = _mm_sub_epi16(T8_1D, avg);
    T9_0A = _mm_sub_epi16(T9_0A, avg);
    T9_1A = _mm_sub_epi16(T9_1A, avg);
    T9_0B = _mm_sub_epi16(T9_0B, avg);
    T9_1B = _mm_sub_epi16(T9_1B, avg);
    T9_0C = _mm_sub_epi16(T9_0C, avg);
    T9_1C = _mm_sub_epi16(T9_1C, avg);
    T9_0D = _mm_sub_epi16(T9_0D, avg);
    T9_1D = _mm_sub_epi16(T9_1D, avg);
    T10_0A = _mm_sub_epi16(T10_0A, avg);
    T10_1A = _mm_sub_epi16(T10_1A, avg);
    T10_0B = _mm_sub_epi16(T10_0B, avg);
    T10_1B = _mm_sub_epi16(T10_1B, avg);
    T10_0C = _mm_sub_epi16(T10_0C, avg);
    T10_1C = _mm_sub_epi16(T10_1C, avg);
    T10_0D = _mm_sub_epi16(T10_0D, avg);
    T10_1D = _mm_sub_epi16(T10_1D, avg);
    T11_0A = _mm_sub_epi16(T11_0A, avg);
    T11_1A = _mm_sub_epi16(T11_1A, avg);
    T11_0B = _mm_sub_epi16(T11_0B, avg);
    T11_1B = _mm_sub_epi16(T11_1B, avg);
    T11_0C = _mm_sub_epi16(T11_0C, avg);
    T11_1C = _mm_sub_epi16(T11_1C, avg);
    T11_0D = _mm_sub_epi16(T11_0D, avg);
    T11_1D = _mm_sub_epi16(T11_1D, avg);
    T12_0A = _mm_sub_epi16(T12_0A, avg);
    T12_1A = _mm_sub_epi16(T12_1A, avg);
    T12_0B = _mm_sub_epi16(T12_0B, avg);
    T12_1B = _mm_sub_epi16(T12_1B, avg);
    T12_0C = _mm_sub_epi16(T12_0C, avg);
    T12_1C = _mm_sub_epi16(T12_1C, avg);
    T12_0D = _mm_sub_epi16(T12_0D, avg);
    T12_1D = _mm_sub_epi16(T12_1D, avg);
    T13_0A = _mm_sub_epi16(T13_0A, avg);
    T13_1A = _mm_sub_epi16(T13_1A, avg);
    T13_0B = _mm_sub_epi16(T13_0B, avg);
    T13_1B = _mm_sub_epi16(T13_1B, avg);
    T13_0C = _mm_sub_epi16(T13_0C, avg);
    T13_1C = _mm_sub_epi16(T13_1C, avg);
    T13_0D = _mm_sub_epi16(T13_0D, avg);
    T13_1D = _mm_sub_epi16(T13_1D, avg);
    T14_0A = _mm_sub_epi16(T14_0A, avg);
    T14_1A = _mm_sub_epi16(T14_1A, avg);
    T14_0B = _mm_sub_epi16(T14_0B, avg);
    T14_1B = _mm_sub_epi16(T14_1B, avg);
    T14_0C = _mm_sub_epi16(T14_0C, avg);
    T14_1C = _mm_sub_epi16(T14_1C, avg);
    T14_0D = _mm_sub_epi16(T14_0D, avg);
    T14_1D = _mm_sub_epi16(T14_1D, avg);
    T15_0A = _mm_sub_epi16(T15_0A, avg);
    T15_1A = _mm_sub_epi16(T15_1A, avg);
    T15_0B = _mm_sub_epi16(T15_0B, avg);
    T15_1B = _mm_sub_epi16(T15_1B, avg);
    T15_0C = _mm_sub_epi16(T15_0C, avg);
    T15_1C = _mm_sub_epi16(T15_1C, avg);
    T15_0D = _mm_sub_epi16(T15_0D, avg);
    T15_1D = _mm_sub_epi16(T15_1D, avg);
    T16_0A = _mm_sub_epi16(T16_0A, avg);
    T16_1A = _mm_sub_epi16(T16_1A, avg);
    T16_0B = _mm_sub_epi16(T16_0B, avg);
    T16_1B = _mm_sub_epi16(T16_1B, avg);
    T16_0C = _mm_sub_epi16(T16_0C, avg);
    T16_1C = _mm_sub_epi16(T16_1C, avg);
    T16_0D = _mm_sub_epi16(T16_0D, avg);
    T16_1D = _mm_sub_epi16(T16_1D, avg);
    T17_0A = _mm_sub_epi16(T17_0A, avg);
    T17_1A = _mm_sub_epi16(T17_1A, avg);
    T17_0B = _mm_sub_epi16(T17_0B, avg);
    T17_1B = _mm_sub_epi16(T17_1B, avg);
    T17_0C = _mm_sub_epi16(T17_0C, avg);
    T17_1C = _mm_sub_epi16(T17_1C, avg);
    T17_0D = _mm_sub_epi16(T17_0D, avg);
    T17_1D = _mm_sub_epi16(T17_1D, avg);
    T18_0A = _mm_sub_epi16(T18_0A, avg);
    T18_1A = _mm_sub_epi16(T18_1A, avg);
    T18_0B = _mm_sub_epi16(T18_0B, avg);
    T18_1B = _mm_sub_epi16(T18_1B, avg);
    T18_0C = _mm_sub_epi16(T18_0C, avg);
    T18_1C = _mm_sub_epi16(T18_1C, avg);
    T18_0D = _mm_sub_epi16(T18_0D, avg);
    T18_1D = _mm_sub_epi16(T18_1D, avg);
    T19_0A = _mm_sub_epi16(T19_0A, avg);
    T19_1A = _mm_sub_epi16(T19_1A, avg);
    T19_0B = _mm_sub_epi16(T19_0B, avg);
    T19_1B = _mm_sub_epi16(T19_1B, avg);
    T19_0C = _mm_sub_epi16(T19_0C, avg);
    T19_1C = _mm_sub_epi16(T19_1C, avg);
    T19_0D = _mm_sub_epi16(T19_0D, avg);
    T19_1D = _mm_sub_epi16(T19_1D, avg);
    T20_0A = _mm_sub_epi16(T20_0A, avg);
    T20_1A = _mm_sub_epi16(T20_1A, avg);
    T20_0B = _mm_sub_epi16(T20_0B, avg);
    T20_1B = _mm_sub_epi16(T20_1B, avg);
    T20_0C = _mm_sub_epi16(T20_0C, avg);
    T20_1C = _mm_sub_epi16(T20_1C, avg);
    T20_0D = _mm_sub_epi16(T20_0D, avg);
    T20_1D = _mm_sub_epi16(T20_1D, avg);
    T21_0A = _mm_sub_epi16(T21_0A, avg);
    T21_1A = _mm_sub_epi16(T21_1A, avg);
    T21_0B = _mm_sub_epi16(T21_0B, avg);
    T21_1B = _mm_sub_epi16(T21_1B, avg);
    T21_0C = _mm_sub_epi16(T21_0C, avg);
    T21_1C = _mm_sub_epi16(T21_1C, avg);
    T21_0D = _mm_sub_epi16(T21_0D, avg);
    T21_1D = _mm_sub_epi16(T21_1D, avg);
    T22_0A = _mm_sub_epi16(T22_0A, avg);
    T22_1A = _mm_sub_epi16(T22_1A, avg);
    T22_0B = _mm_sub_epi16(T22_0B, avg);
    T22_1B = _mm_sub_epi16(T22_1B, avg);
    T22_0C = _mm_sub_epi16(T22_0C, avg);
    T22_1C = _mm_sub_epi16(T22_1C, avg);
    T22_0D = _mm_sub_epi16(T22_0D, avg);
    T22_1D = _mm_sub_epi16(T22_1D, avg);
    T23_0A = _mm_sub_epi16(T23_0A, avg);
    T23_1A = _mm_sub_epi16(T23_1A, avg);
    T23_0B = _mm_sub_epi16(T23_0B, avg);
    T23_1B = _mm_sub_epi16(T23_1B, avg);
    T23_0C = _mm_sub_epi16(T23_0C, avg);
    T23_1C = _mm_sub_epi16(T23_1C, avg);
    T23_0D = _mm_sub_epi16(T23_0D, avg);
    T23_1D = _mm_sub_epi16(T23_1D, avg);
    T24_0A = _mm_sub_epi16(T24_0A, avg);
    T24_1A = _mm_sub_epi16(T24_1A, avg);
    T24_0B = _mm_sub_epi16(T24_0B, avg);
    T24_1B = _mm_sub_epi16(T24_1B, avg);
    T24_0C = _mm_sub_epi16(T24_0C, avg);
    T24_1C = _mm_sub_epi16(T24_1C, avg);
    T24_0D = _mm_sub_epi16(T24_0D, avg);
    T24_1D = _mm_sub_epi16(T24_1D, avg);
    T25_0A = _mm_sub_epi16(T25_0A, avg);
    T25_1A = _mm_sub_epi16(T25_1A, avg);
    T25_0B = _mm_sub_epi16(T25_0B, avg);
    T25_1B = _mm_sub_epi16(T25_1B, avg);
    T25_0C = _mm_sub_epi16(T25_0C, avg);
    T25_1C = _mm_sub_epi16(T25_1C, avg);
    T25_0D = _mm_sub_epi16(T25_0D, avg);
    T25_1D = _mm_sub_epi16(T25_1D, avg);
    T26_0A = _mm_sub_epi16(T26_0A, avg);
    T26_1A = _mm_sub_epi16(T26_1A, avg);
    T26_0B = _mm_sub_epi16(T26_0B, avg);
    T26_1B = _mm_sub_epi16(T26_1B, avg);
    T26_0C = _mm_sub_epi16(T26_0C, avg);
    T26_1C = _mm_sub_epi16(T26_1C, avg);
    T26_0D = _mm_sub_epi16(T26_0D, avg);
    T26_1D = _mm_sub_epi16(T26_1D, avg);
    T27_0A = _mm_sub_epi16(T27_0A, avg);
    T27_1A = _mm_sub_epi16(T27_1A, avg);
    T27_0B = _mm_sub_epi16(T27_0B, avg);
    T27_1B = _mm_sub_epi16(T27_1B, avg);
    T27_0C = _mm_sub_epi16(T27_0C, avg);
    T27_1C = _mm_sub_epi16(T27_1C, avg);
    T27_0D = _mm_sub_epi16(T27_0D, avg);
    T27_1D = _mm_sub_epi16(T27_1D, avg);
    T28_0A = _mm_sub_epi16(T28_0A, avg);
    T28_1A = _mm_sub_epi16(T28_1A, avg);
    T28_0B = _mm_sub_epi16(T28_0B, avg);
    T28_1B = _mm_sub_epi16(T28_1B, avg);
    T28_0C = _mm_sub_epi16(T28_0C, avg);
    T28_1C = _mm_sub_epi16(T28_1C, avg);
    T28_0D = _mm_sub_epi16(T28_0D, avg);
    T28_1D = _mm_sub_epi16(T28_1D, avg);
    T29_0A = _mm_sub_epi16(T29_0A, avg);
    T29_1A = _mm_sub_epi16(T29_1A, avg);
    T29_0B = _mm_sub_epi16(T29_0B, avg);
    T29_1B = _mm_sub_epi16(T29_1B, avg);
    T29_0C = _mm_sub_epi16(T29_0C, avg);
    T29_1C = _mm_sub_epi16(T29_1C, avg);
    T29_0D = _mm_sub_epi16(T29_0D, avg);
    T29_1D = _mm_sub_epi16(T29_1D, avg);
    T30_0A = _mm_sub_epi16(T30_0A, avg);
    T30_1A = _mm_sub_epi16(T30_1A, avg);
    T30_0B = _mm_sub_epi16(T30_0B, avg);
    T30_1B = _mm_sub_epi16(T30_1B, avg);
    T30_0C = _mm_sub_epi16(T30_0C, avg);
    T30_1C = _mm_sub_epi16(T30_1C, avg);
    T30_0D = _mm_sub_epi16(T30_0D, avg);
    T30_1D = _mm_sub_epi16(T30_1D, avg);
    T31_0A = _mm_sub_epi16(T31_0A, avg);
    T31_1A = _mm_sub_epi16(T31_1A, avg);
    T31_0B = _mm_sub_epi16(T31_0B, avg);
    T31_1B = _mm_sub_epi16(T31_1B, avg);
    T31_0C = _mm_sub_epi16(T31_0C, avg);
    T31_1C = _mm_sub_epi16(T31_1C, avg);
    T31_0D = _mm_sub_epi16(T31_0D, avg);
    T31_1D = _mm_sub_epi16(T31_1D, avg);
    T32_0A = _mm_sub_epi16(T32_0A, avg);
    T32_1A = _mm_sub_epi16(T32_1A, avg);
    T32_0B = _mm_sub_epi16(T32_0B, avg);
    T32_1B = _mm_sub_epi16(T32_1B, avg);
    T32_0C = _mm_sub_epi16(T32_0C, avg);
    T32_1C = _mm_sub_epi16(T32_1C, avg);
    T32_0D = _mm_sub_epi16(T32_0D, avg);
    T32_1D = _mm_sub_epi16(T32_1D, avg);
    T33_0A = _mm_sub_epi16(T33_0A, avg);
    T33_1A = _mm_sub_epi16(T33_1A, avg);
    T33_0B = _mm_sub_epi16(T33_0B, avg);
    T33_1B = _mm_sub_epi16(T33_1B, avg);
    T33_0C = _mm_sub_epi16(T33_0C, avg);
    T33_1C = _mm_sub_epi16(T33_1C, avg);
    T33_0D = _mm_sub_epi16(T33_0D, avg);
    T33_1D = _mm_sub_epi16(T33_1D, avg);
    T34_0A = _mm_sub_epi16(T34_0A, avg);
    T34_1A = _mm_sub_epi16(T34_1A, avg);
    T34_0B = _mm_sub_epi16(T34_0B, avg);
    T34_1B = _mm_sub_epi16(T34_1B, avg);
    T34_0C = _mm_sub_epi16(T34_0C, avg);
    T34_1C = _mm_sub_epi16(T34_1C, avg);
    T34_0D = _mm_sub_epi16(T34_0D, avg);
    T34_1D = _mm_sub_epi16(T34_1D, avg);
    T35_0A = _mm_sub_epi16(T35_0A, avg);
    T35_1A = _mm_sub_epi16(T35_1A, avg);
    T35_0B = _mm_sub_epi16(T35_0B, avg);
    T35_1B = _mm_sub_epi16(T35_1B, avg);
    T35_0C = _mm_sub_epi16(T35_0C, avg);
    T35_1C = _mm_sub_epi16(T35_1C, avg);
    T35_0D = _mm_sub_epi16(T35_0D, avg);
    T35_1D = _mm_sub_epi16(T35_1D, avg);
    T36_0A = _mm_sub_epi16(T36_0A, avg);
    T36_1A = _mm_sub_epi16(T36_1A, avg);
    T36_0B = _mm_sub_epi16(T36_0B, avg);
    T36_1B = _mm_sub_epi16(T36_1B, avg);
    T36_0C = _mm_sub_epi16(T36_0C, avg);
    T36_1C = _mm_sub_epi16(T36_1C, avg);
    T36_0D = _mm_sub_epi16(T36_0D, avg);
    T36_1D = _mm_sub_epi16(T36_1D, avg);
    T37_0A = _mm_sub_epi16(T37_0A, avg);
    T37_1A = _mm_sub_epi16(T37_1A, avg);
    T37_0B = _mm_sub_epi16(T37_0B, avg);
    T37_1B = _mm_sub_epi16(T37_1B, avg);
    T37_0C = _mm_sub_epi16(T37_0C, avg);
    T37_1C = _mm_sub_epi16(T37_1C, avg);
    T37_0D = _mm_sub_epi16(T37_0D, avg);
    T37_1D = _mm_sub_epi16(T37_1D, avg);
    T38_0A = _mm_sub_epi16(T38_0A, avg);
    T38_1A = _mm_sub_epi16(T38_1A, avg);
    T38_0B = _mm_sub_epi16(T38_0B, avg);
    T38_1B = _mm_sub_epi16(T38_1B, avg);
    T38_0C = _mm_sub_epi16(T38_0C, avg);
    T38_1C = _mm_sub_epi16(T38_1C, avg);
    T38_0D = _mm_sub_epi16(T38_0D, avg);
    T38_1D = _mm_sub_epi16(T38_1D, avg);
    T39_0A = _mm_sub_epi16(T39_0A, avg);
    T39_1A = _mm_sub_epi16(T39_1A, avg);
    T39_0B = _mm_sub_epi16(T39_0B, avg);
    T39_1B = _mm_sub_epi16(T39_1B, avg);
    T39_0C = _mm_sub_epi16(T39_0C, avg);
    T39_1C = _mm_sub_epi16(T39_1C, avg);
    T39_0D = _mm_sub_epi16(T39_0D, avg);
    T39_1D = _mm_sub_epi16(T39_1D, avg);
    T40_0A = _mm_sub_epi16(T40_0A, avg);
    T40_1A = _mm_sub_epi16(T40_1A, avg);
    T40_0B = _mm_sub_epi16(T40_0B, avg);
    T40_1B = _mm_sub_epi16(T40_1B, avg);
    T40_0C = _mm_sub_epi16(T40_0C, avg);
    T40_1C = _mm_sub_epi16(T40_1C, avg);
    T40_0D = _mm_sub_epi16(T40_0D, avg);
    T40_1D = _mm_sub_epi16(T40_1D, avg);
    T41_0A = _mm_sub_epi16(T41_0A, avg);
    T41_1A = _mm_sub_epi16(T41_1A, avg);
    T41_0B = _mm_sub_epi16(T41_0B, avg);
    T41_1B = _mm_sub_epi16(T41_1B, avg);
    T41_0C = _mm_sub_epi16(T41_0C, avg);
    T41_1C = _mm_sub_epi16(T41_1C, avg);
    T41_0D = _mm_sub_epi16(T41_0D, avg);
    T41_1D = _mm_sub_epi16(T41_1D, avg);
    T42_0A = _mm_sub_epi16(T42_0A, avg);
    T42_1A = _mm_sub_epi16(T42_1A, avg);
    T42_0B = _mm_sub_epi16(T42_0B, avg);
    T42_1B = _mm_sub_epi16(T42_1B, avg);
    T42_0C = _mm_sub_epi16(T42_0C, avg);
    T42_1C = _mm_sub_epi16(T42_1C, avg);
    T42_0D = _mm_sub_epi16(T42_0D, avg);
    T42_1D = _mm_sub_epi16(T42_1D, avg);
    T43_0A = _mm_sub_epi16(T43_0A, avg);
    T43_1A = _mm_sub_epi16(T43_1A, avg);
    T43_0B = _mm_sub_epi16(T43_0B, avg);
    T43_1B = _mm_sub_epi16(T43_1B, avg);
    T43_0C = _mm_sub_epi16(T43_0C, avg);
    T43_1C = _mm_sub_epi16(T43_1C, avg);
    T43_0D = _mm_sub_epi16(T43_0D, avg);
    T43_1D = _mm_sub_epi16(T43_1D, avg);
    T44_0A = _mm_sub_epi16(T44_0A, avg);
    T44_1A = _mm_sub_epi16(T44_1A, avg);
    T44_0B = _mm_sub_epi16(T44_0B, avg);
    T44_1B = _mm_sub_epi16(T44_1B, avg);
    T44_0C = _mm_sub_epi16(T44_0C, avg);
    T44_1C = _mm_sub_epi16(T44_1C, avg);
    T44_0D = _mm_sub_epi16(T44_0D, avg);
    T44_1D = _mm_sub_epi16(T44_1D, avg);
    T45_0A = _mm_sub_epi16(T45_0A, avg);
    T45_1A = _mm_sub_epi16(T45_1A, avg);
    T45_0B = _mm_sub_epi16(T45_0B, avg);
    T45_1B = _mm_sub_epi16(T45_1B, avg);
    T45_0C = _mm_sub_epi16(T45_0C, avg);
    T45_1C = _mm_sub_epi16(T45_1C, avg);
    T45_0D = _mm_sub_epi16(T45_0D, avg);
    T45_1D = _mm_sub_epi16(T45_1D, avg);
    T46_0A = _mm_sub_epi16(T46_0A, avg);
    T46_1A = _mm_sub_epi16(T46_1A, avg);
    T46_0B = _mm_sub_epi16(T46_0B, avg);
    T46_1B = _mm_sub_epi16(T46_1B, avg);
    T46_0C = _mm_sub_epi16(T46_0C, avg);
    T46_1C = _mm_sub_epi16(T46_1C, avg);
    T46_0D = _mm_sub_epi16(T46_0D, avg);
    T46_1D = _mm_sub_epi16(T46_1D, avg);
    T47_0A = _mm_sub_epi16(T47_0A, avg);
    T47_1A = _mm_sub_epi16(T47_1A, avg);
    T47_0B = _mm_sub_epi16(T47_0B, avg);
    T47_1B = _mm_sub_epi16(T47_1B, avg);
    T47_0C = _mm_sub_epi16(T47_0C, avg);
    T47_1C = _mm_sub_epi16(T47_1C, avg);
    T47_0D = _mm_sub_epi16(T47_0D, avg);
    T47_1D = _mm_sub_epi16(T47_1D, avg);
    T48_0A = _mm_sub_epi16(T48_0A, avg);
    T48_1A = _mm_sub_epi16(T48_1A, avg);
    T48_0B = _mm_sub_epi16(T48_0B, avg);
    T48_1B = _mm_sub_epi16(T48_1B, avg);
    T48_0C = _mm_sub_epi16(T48_0C, avg);
    T48_1C = _mm_sub_epi16(T48_1C, avg);
    T48_0D = _mm_sub_epi16(T48_0D, avg);
    T48_1D = _mm_sub_epi16(T48_1D, avg);
    T49_0A = _mm_sub_epi16(T49_0A, avg);
    T49_1A = _mm_sub_epi16(T49_1A, avg);
    T49_0B = _mm_sub_epi16(T49_0B, avg);
    T49_1B = _mm_sub_epi16(T49_1B, avg);
    T49_0C = _mm_sub_epi16(T49_0C, avg);
    T49_1C = _mm_sub_epi16(T49_1C, avg);
    T49_0D = _mm_sub_epi16(T49_0D, avg);
    T49_1D = _mm_sub_epi16(T49_1D, avg);
    T50_0A = _mm_sub_epi16(T50_0A, avg);
    T50_1A = _mm_sub_epi16(T50_1A, avg);
    T50_0B = _mm_sub_epi16(T50_0B, avg);
    T50_1B = _mm_sub_epi16(T50_1B, avg);
    T50_0C = _mm_sub_epi16(T50_0C, avg);
    T50_1C = _mm_sub_epi16(T50_1C, avg);
    T50_0D = _mm_sub_epi16(T50_0D, avg);
    T50_1D = _mm_sub_epi16(T50_1D, avg);
    T51_0A = _mm_sub_epi16(T51_0A, avg);
    T51_1A = _mm_sub_epi16(T51_1A, avg);
    T51_0B = _mm_sub_epi16(T51_0B, avg);
    T51_1B = _mm_sub_epi16(T51_1B, avg);
    T51_0C = _mm_sub_epi16(T51_0C, avg);
    T51_1C = _mm_sub_epi16(T51_1C, avg);
    T51_0D = _mm_sub_epi16(T51_0D, avg);
    T51_1D = _mm_sub_epi16(T51_1D, avg);
    T52_0A = _mm_sub_epi16(T52_0A, avg);
    T52_1A = _mm_sub_epi16(T52_1A, avg);
    T52_0B = _mm_sub_epi16(T52_0B, avg);
    T52_1B = _mm_sub_epi16(T52_1B, avg);
    T52_0C = _mm_sub_epi16(T52_0C, avg);
    T52_1C = _mm_sub_epi16(T52_1C, avg);
    T52_0D = _mm_sub_epi16(T52_0D, avg);
    T52_1D = _mm_sub_epi16(T52_1D, avg);
    T53_0A = _mm_sub_epi16(T53_0A, avg);
    T53_1A = _mm_sub_epi16(T53_1A, avg);
    T53_0B = _mm_sub_epi16(T53_0B, avg);
    T53_1B = _mm_sub_epi16(T53_1B, avg);
    T53_0C = _mm_sub_epi16(T53_0C, avg);
    T53_1C = _mm_sub_epi16(T53_1C, avg);
    T53_0D = _mm_sub_epi16(T53_0D, avg);
    T53_1D = _mm_sub_epi16(T53_1D, avg);

    T54_0A = _mm_sub_epi16(T54_0A, avg);
    T54_1A = _mm_sub_epi16(T54_1A, avg);
    T54_0B = _mm_sub_epi16(T54_0B, avg);
    T54_1B = _mm_sub_epi16(T54_1B, avg);
    T54_0C = _mm_sub_epi16(T54_0C, avg);
    T54_1C = _mm_sub_epi16(T54_1C, avg);
    T54_0D = _mm_sub_epi16(T54_0D, avg);
    T54_1D = _mm_sub_epi16(T54_1D, avg);
    T55_0A = _mm_sub_epi16(T55_0A, avg);
    T55_1A = _mm_sub_epi16(T55_1A, avg);
    T55_0B = _mm_sub_epi16(T55_0B, avg);
    T55_1B = _mm_sub_epi16(T55_1B, avg);
    T55_0C = _mm_sub_epi16(T55_0C, avg);
    T55_1C = _mm_sub_epi16(T55_1C, avg);
    T55_0D = _mm_sub_epi16(T55_0D, avg);
    T55_1D = _mm_sub_epi16(T55_1D, avg);
    T56_0A = _mm_sub_epi16(T56_0A, avg);
    T56_1A = _mm_sub_epi16(T56_1A, avg);
    T56_0B = _mm_sub_epi16(T56_0B, avg);
    T56_1B = _mm_sub_epi16(T56_1B, avg);
    T56_0C = _mm_sub_epi16(T56_0C, avg);
    T56_1C = _mm_sub_epi16(T56_1C, avg);
    T56_0D = _mm_sub_epi16(T56_0D, avg);
    T56_1D = _mm_sub_epi16(T56_1D, avg);
    T57_0A = _mm_sub_epi16(T57_0A, avg);
    T57_1A = _mm_sub_epi16(T57_1A, avg);
    T57_0B = _mm_sub_epi16(T57_0B, avg);
    T57_1B = _mm_sub_epi16(T57_1B, avg);
    T57_0C = _mm_sub_epi16(T57_0C, avg);
    T57_1C = _mm_sub_epi16(T57_1C, avg);
    T57_0D = _mm_sub_epi16(T57_0D, avg);
    T57_1D = _mm_sub_epi16(T57_1D, avg);
    T58_0A = _mm_sub_epi16(T58_0A, avg);
    T58_1A = _mm_sub_epi16(T58_1A, avg);
    T58_0B = _mm_sub_epi16(T58_0B, avg);
    T58_1B = _mm_sub_epi16(T58_1B, avg);
    T58_0C = _mm_sub_epi16(T58_0C, avg);
    T58_1C = _mm_sub_epi16(T58_1C, avg);
    T58_0D = _mm_sub_epi16(T58_0D, avg);
    T58_1D = _mm_sub_epi16(T58_1D, avg);
    T59_0A = _mm_sub_epi16(T59_0A, avg);
    T59_1A = _mm_sub_epi16(T59_1A, avg);
    T59_0B = _mm_sub_epi16(T59_0B, avg);
    T59_1B = _mm_sub_epi16(T59_1B, avg);
    T59_0C = _mm_sub_epi16(T59_0C, avg);
    T59_1C = _mm_sub_epi16(T59_1C, avg);
    T59_0D = _mm_sub_epi16(T59_0D, avg);
    T59_1D = _mm_sub_epi16(T59_1D, avg);
    T60_0A = _mm_sub_epi16(T60_0A, avg);
    T60_1A = _mm_sub_epi16(T60_1A, avg);
    T60_0B = _mm_sub_epi16(T60_0B, avg);
    T60_1B = _mm_sub_epi16(T60_1B, avg);
    T60_0C = _mm_sub_epi16(T60_0C, avg);
    T60_1C = _mm_sub_epi16(T60_1C, avg);
    T60_0D = _mm_sub_epi16(T60_0D, avg);
    T60_1D = _mm_sub_epi16(T60_1D, avg);
    T61_0A = _mm_sub_epi16(T61_0A, avg);
    T61_1A = _mm_sub_epi16(T61_1A, avg);
    T61_0B = _mm_sub_epi16(T61_0B, avg);
    T61_1B = _mm_sub_epi16(T61_1B, avg);
    T61_0C = _mm_sub_epi16(T61_0C, avg);
    T61_1C = _mm_sub_epi16(T61_1C, avg);
    T61_0D = _mm_sub_epi16(T61_0D, avg);
    T61_1D = _mm_sub_epi16(T61_1D, avg);
    T62_0A = _mm_sub_epi16(T62_0A, avg);
    T62_1A = _mm_sub_epi16(T62_1A, avg);
    T62_0B = _mm_sub_epi16(T62_0B, avg);
    T62_1B = _mm_sub_epi16(T62_1B, avg);
    T62_0C = _mm_sub_epi16(T62_0C, avg);
    T62_1C = _mm_sub_epi16(T62_1C, avg);
    T62_0D = _mm_sub_epi16(T62_0D, avg);
    T62_1D = _mm_sub_epi16(T62_1D, avg);
    T63_0A = _mm_sub_epi16(T63_0A, avg);
    T63_1A = _mm_sub_epi16(T63_1A, avg);
    T63_0B = _mm_sub_epi16(T63_0B, avg);
    T63_1B = _mm_sub_epi16(T63_1B, avg);
    T63_0C = _mm_sub_epi16(T63_0C, avg);
    T63_1C = _mm_sub_epi16(T63_1C, avg);
    T63_0D = _mm_sub_epi16(T63_0D, avg);
    T63_1D = _mm_sub_epi16(T63_1D, avg);

    T0_0A = _mm_abs_epi16(T0_0A);
    T0_1A = _mm_abs_epi16(T0_1A);
    T0_0B = _mm_abs_epi16(T0_0B);
    T0_1B = _mm_abs_epi16(T0_1B);
    T0_0C = _mm_abs_epi16(T0_0C);
    T0_1C = _mm_abs_epi16(T0_1C);
    T0_0D = _mm_abs_epi16(T0_0D);
    T0_1D = _mm_abs_epi16(T0_1D);
    T0 = _mm_add_epi16(T0_0A, T0_1A);
    T0 = _mm_add_epi16(T0, T0_0B);
    T0 = _mm_add_epi16(T0, T0_1B);
    T0 = _mm_add_epi16(T0, T0_0C);
    T0 = _mm_add_epi16(T0, T0_1C);
    T0 = _mm_add_epi16(T0, T0_0D);
    T0 = _mm_add_epi16(T0, T0_1D);

    T1_0A = _mm_abs_epi16(T1_0A);
    T1_1A = _mm_abs_epi16(T1_1A);
    T1_0B = _mm_abs_epi16(T1_0B);
    T1_1B = _mm_abs_epi16(T1_1B);
    T1_0C = _mm_abs_epi16(T1_0C);
    T1_1C = _mm_abs_epi16(T1_1C);
    T1_0D = _mm_abs_epi16(T1_0D);
    T1_1D = _mm_abs_epi16(T1_1D);
    T1 = _mm_add_epi16(T1_0A, T1_1A);
    T1 = _mm_add_epi16(T1, T1_0B);
    T1 = _mm_add_epi16(T1, T1_1B);
    T1 = _mm_add_epi16(T1, T1_0C);
    T1 = _mm_add_epi16(T1, T1_1C);
    T1 = _mm_add_epi16(T1, T1_0D);
    T1 = _mm_add_epi16(T1, T1_1D);

    T2_0A = _mm_abs_epi16(T2_0A);
    T2_1A = _mm_abs_epi16(T2_1A);
    T2_0B = _mm_abs_epi16(T2_0B);
    T2_1B = _mm_abs_epi16(T2_1B);
    T2_0C = _mm_abs_epi16(T2_0C);
    T2_1C = _mm_abs_epi16(T2_1C);
    T2_0D = _mm_abs_epi16(T2_0D);
    T2_1D = _mm_abs_epi16(T2_1D);
    T2 = _mm_add_epi16(T2_0A, T2_1A);
    T2 = _mm_add_epi16(T2, T2_0B);
    T2 = _mm_add_epi16(T2, T2_1B);
    T2 = _mm_add_epi16(T2, T2_0C);
    T2 = _mm_add_epi16(T2, T2_1C);
    T2 = _mm_add_epi16(T2, T2_0D);
    T2 = _mm_add_epi16(T2, T2_1D);

    T3_0A = _mm_abs_epi16(T3_0A);
    T3_1A = _mm_abs_epi16(T3_1A);
    T3_0B = _mm_abs_epi16(T3_0B);
    T3_1B = _mm_abs_epi16(T3_1B);
    T3_0C = _mm_abs_epi16(T3_0C);
    T3_1C = _mm_abs_epi16(T3_1C);
    T3_0D = _mm_abs_epi16(T3_0D);
    T3_1D = _mm_abs_epi16(T3_1D);
    T3 = _mm_add_epi16(T3_0A, T3_1A);
    T3 = _mm_add_epi16(T3, T3_0B);
    T3 = _mm_add_epi16(T3, T3_1B);
    T3 = _mm_add_epi16(T3, T3_0C);
    T3 = _mm_add_epi16(T3, T3_1C);
    T3 = _mm_add_epi16(T3, T3_0D);
    T3 = _mm_add_epi16(T3, T3_1D);

    T4_0A = _mm_abs_epi16(T4_0A);
    T4_1A = _mm_abs_epi16(T4_1A);
    T4_0B = _mm_abs_epi16(T4_0B);
    T4_1B = _mm_abs_epi16(T4_1B);
    T4_0C = _mm_abs_epi16(T4_0C);
    T4_1C = _mm_abs_epi16(T4_1C);
    T4_0D = _mm_abs_epi16(T4_0D);
    T4_1D = _mm_abs_epi16(T4_1D);
    T4 = _mm_add_epi16(T4_0A, T4_1A);
    T4 = _mm_add_epi16(T4, T4_0B);
    T4 = _mm_add_epi16(T4, T4_1B);
    T4 = _mm_add_epi16(T4, T4_0C);
    T4 = _mm_add_epi16(T4, T4_1C);
    T4 = _mm_add_epi16(T4, T4_0D);
    T4 = _mm_add_epi16(T4, T4_1D);

    T5_0A = _mm_abs_epi16(T5_0A);
    T5_1A = _mm_abs_epi16(T5_1A);
    T5_0B = _mm_abs_epi16(T5_0B);
    T5_1B = _mm_abs_epi16(T5_1B);
    T5_0C = _mm_abs_epi16(T5_0C);
    T5_1C = _mm_abs_epi16(T5_1C);
    T5_0D = _mm_abs_epi16(T5_0D);
    T5_1D = _mm_abs_epi16(T5_1D);
    T5 = _mm_add_epi16(T5_0A, T5_1A);
    T5 = _mm_add_epi16(T5, T5_0B);
    T5 = _mm_add_epi16(T5, T5_1B);
    T5 = _mm_add_epi16(T5, T5_0C);
    T5 = _mm_add_epi16(T5, T5_1C);
    T5 = _mm_add_epi16(T5, T5_0D);
    T5 = _mm_add_epi16(T5, T5_1D);

    T6_0A = _mm_abs_epi16(T6_0A);
    T6_1A = _mm_abs_epi16(T6_1A);
    T6_0B = _mm_abs_epi16(T6_0B);
    T6_1B = _mm_abs_epi16(T6_1B);
    T6_0C = _mm_abs_epi16(T6_0C);
    T6_1C = _mm_abs_epi16(T6_1C);
    T6_0D = _mm_abs_epi16(T6_0D);
    T6_1D = _mm_abs_epi16(T6_1D);
    T6 = _mm_add_epi16(T6_0A, T6_1A);
    T6 = _mm_add_epi16(T6, T6_0B);
    T6 = _mm_add_epi16(T6, T6_1B);
    T6 = _mm_add_epi16(T6, T6_0C);
    T6 = _mm_add_epi16(T6, T6_1C);
    T6 = _mm_add_epi16(T6, T6_0D);
    T6 = _mm_add_epi16(T6, T6_1D);

    T7_0A = _mm_abs_epi16(T7_0A);
    T7_1A = _mm_abs_epi16(T7_1A);
    T7_0B = _mm_abs_epi16(T7_0B);
    T7_1B = _mm_abs_epi16(T7_1B);
    T7_0C = _mm_abs_epi16(T7_0C);
    T7_1C = _mm_abs_epi16(T7_1C);
    T7_0D = _mm_abs_epi16(T7_0D);
    T7_1D = _mm_abs_epi16(T7_1D);
    T7 = _mm_add_epi16(T7_0A, T7_1A);
    T7 = _mm_add_epi16(T7, T7_0B);
    T7 = _mm_add_epi16(T7, T7_1B);
    T7 = _mm_add_epi16(T7, T7_0C);
    T7 = _mm_add_epi16(T7, T7_1C);
    T7 = _mm_add_epi16(T7, T7_0D);
    T7 = _mm_add_epi16(T7, T7_1D);

    T8_0A = _mm_abs_epi16(T8_0A);
    T8_1A = _mm_abs_epi16(T8_1A);
    T8_0B = _mm_abs_epi16(T8_0B);
    T8_1B = _mm_abs_epi16(T8_1B);
    T8_0C = _mm_abs_epi16(T8_0C);
    T8_1C = _mm_abs_epi16(T8_1C);
    T8_0D = _mm_abs_epi16(T8_0D);
    T8_1D = _mm_abs_epi16(T8_1D);
    T8 = _mm_add_epi16(T8_0A, T8_1A);
    T8 = _mm_add_epi16(T8, T8_0B);
    T8 = _mm_add_epi16(T8, T8_1B);
    T8 = _mm_add_epi16(T8, T8_0C);
    T8 = _mm_add_epi16(T8, T8_1C);
    T8 = _mm_add_epi16(T8, T8_0D);
    T8 = _mm_add_epi16(T8, T8_1D);

    T9_0A = _mm_abs_epi16(T9_0A);
    T9_1A = _mm_abs_epi16(T9_1A);
    T9_0B = _mm_abs_epi16(T9_0B);
    T9_1B = _mm_abs_epi16(T9_1B);
    T9_0C = _mm_abs_epi16(T9_0C);
    T9_1C = _mm_abs_epi16(T9_1C);
    T9_0D = _mm_abs_epi16(T9_0D);
    T9_1D = _mm_abs_epi16(T9_1D);
    T9 = _mm_add_epi16(T9_0A, T9_1A);
    T9 = _mm_add_epi16(T9, T9_0B);
    T9 = _mm_add_epi16(T9, T9_1B);
    T9 = _mm_add_epi16(T9, T9_0C);
    T9 = _mm_add_epi16(T9, T9_1C);
    T9 = _mm_add_epi16(T9, T9_0D);
    T9 = _mm_add_epi16(T9, T9_1D);

    T10_0A = _mm_abs_epi16(T10_0A);
    T10_1A = _mm_abs_epi16(T10_1A);
    T10_0B = _mm_abs_epi16(T10_0B);
    T10_1B = _mm_abs_epi16(T10_1B);
    T10_0C = _mm_abs_epi16(T10_0C);
    T10_1C = _mm_abs_epi16(T10_1C);
    T10_0D = _mm_abs_epi16(T10_0D);
    T10_1D = _mm_abs_epi16(T10_1D);
    T10 = _mm_add_epi16(T10_0A, T10_1A);
    T10 = _mm_add_epi16(T10, T10_0B);
    T10 = _mm_add_epi16(T10, T10_1B);
    T10 = _mm_add_epi16(T10, T10_0C);
    T10 = _mm_add_epi16(T10, T10_1C);
    T10 = _mm_add_epi16(T10, T10_0D);
    T10 = _mm_add_epi16(T10, T10_1D);

    T11_0A = _mm_abs_epi16(T11_0A);
    T11_1A = _mm_abs_epi16(T11_1A);
    T11_0B = _mm_abs_epi16(T11_0B);
    T11_1B = _mm_abs_epi16(T11_1B);
    T11_0C = _mm_abs_epi16(T11_0C);
    T11_1C = _mm_abs_epi16(T11_1C);
    T11_0D = _mm_abs_epi16(T11_0D);
    T11_1D = _mm_abs_epi16(T11_1D);
    T11 = _mm_add_epi16(T11_0A, T11_1A);
    T11 = _mm_add_epi16(T11, T11_0B);
    T11 = _mm_add_epi16(T11, T11_1B);
    T11 = _mm_add_epi16(T11, T11_0C);
    T11 = _mm_add_epi16(T11, T11_1C);
    T11 = _mm_add_epi16(T11, T11_0D);
    T11 = _mm_add_epi16(T11, T11_1D);

    T12_0A = _mm_abs_epi16(T12_0A);
    T12_1A = _mm_abs_epi16(T12_1A);
    T12_0B = _mm_abs_epi16(T12_0B);
    T12_1B = _mm_abs_epi16(T12_1B);
    T12_0C = _mm_abs_epi16(T12_0C);
    T12_1C = _mm_abs_epi16(T12_1C);
    T12_0D = _mm_abs_epi16(T12_0D);
    T12_1D = _mm_abs_epi16(T12_1D);
    T12 = _mm_add_epi16(T12_0A, T12_1A);
    T12 = _mm_add_epi16(T12, T12_0B);
    T12 = _mm_add_epi16(T12, T12_1B);
    T12 = _mm_add_epi16(T12, T12_0C);
    T12 = _mm_add_epi16(T12, T12_1C);
    T12 = _mm_add_epi16(T12, T12_0D);
    T12 = _mm_add_epi16(T12, T12_1D);

    T13_0A = _mm_abs_epi16(T13_0A);
    T13_1A = _mm_abs_epi16(T13_1A);
    T13_0B = _mm_abs_epi16(T13_0B);
    T13_1B = _mm_abs_epi16(T13_1B);
    T13_0C = _mm_abs_epi16(T13_0C);
    T13_1C = _mm_abs_epi16(T13_1C);
    T13_0D = _mm_abs_epi16(T13_0D);
    T13_1D = _mm_abs_epi16(T13_1D);
    T13 = _mm_add_epi16(T13_0A, T13_1A);
    T13 = _mm_add_epi16(T13, T13_0B);
    T13 = _mm_add_epi16(T13, T13_1B);
    T13 = _mm_add_epi16(T13, T13_0C);
    T13 = _mm_add_epi16(T13, T13_1C);
    T13 = _mm_add_epi16(T13, T13_0D);
    T13 = _mm_add_epi16(T13, T13_1D);

    T14_0A = _mm_abs_epi16(T14_0A);
    T14_1A = _mm_abs_epi16(T14_1A);
    T14_0B = _mm_abs_epi16(T14_0B);
    T14_1B = _mm_abs_epi16(T14_1B);
    T14_0C = _mm_abs_epi16(T14_0C);
    T14_1C = _mm_abs_epi16(T14_1C);
    T14_0D = _mm_abs_epi16(T14_0D);
    T14_1D = _mm_abs_epi16(T14_1D);
    T14 = _mm_add_epi16(T14_0A, T14_1A);
    T14 = _mm_add_epi16(T14, T14_0B);
    T14 = _mm_add_epi16(T14, T14_1B);
    T14 = _mm_add_epi16(T14, T14_0C);
    T14 = _mm_add_epi16(T14, T14_1C);
    T14 = _mm_add_epi16(T14, T14_0D);
    T14 = _mm_add_epi16(T14, T14_1D);

    T15_0A = _mm_abs_epi16(T15_0A);
    T15_1A = _mm_abs_epi16(T15_1A);
    T15_0B = _mm_abs_epi16(T15_0B);
    T15_1B = _mm_abs_epi16(T15_1B);
    T15_0C = _mm_abs_epi16(T15_0C);
    T15_1C = _mm_abs_epi16(T15_1C);
    T15_0D = _mm_abs_epi16(T15_0D);
    T15_1D = _mm_abs_epi16(T15_1D);
    T15 = _mm_add_epi16(T15_0A, T15_1A);
    T15 = _mm_add_epi16(T15, T15_0B);
    T15 = _mm_add_epi16(T15, T15_1B);
    T15 = _mm_add_epi16(T15, T15_0C);
    T15 = _mm_add_epi16(T15, T15_1C);
    T15 = _mm_add_epi16(T15, T15_0D);
    T15 = _mm_add_epi16(T15, T15_1D);

    T16_0A = _mm_abs_epi16(T16_0A);
    T16_1A = _mm_abs_epi16(T16_1A);
    T16_0B = _mm_abs_epi16(T16_0B);
    T16_1B = _mm_abs_epi16(T16_1B);
    T16_0C = _mm_abs_epi16(T16_0C);
    T16_1C = _mm_abs_epi16(T16_1C);
    T16_0D = _mm_abs_epi16(T16_0D);
    T16_1D = _mm_abs_epi16(T16_1D);
    T16 = _mm_add_epi16(T16_0A, T16_1A);
    T16 = _mm_add_epi16(T16, T16_0B);
    T16 = _mm_add_epi16(T16, T16_1B);
    T16 = _mm_add_epi16(T16, T16_0C);
    T16 = _mm_add_epi16(T16, T16_1C);
    T16 = _mm_add_epi16(T16, T16_0D);
    T16 = _mm_add_epi16(T16, T16_1D);

    T17_0A = _mm_abs_epi16(T17_0A);
    T17_1A = _mm_abs_epi16(T17_1A);
    T17_0B = _mm_abs_epi16(T17_0B);
    T17_1B = _mm_abs_epi16(T17_1B);
    T17_0C = _mm_abs_epi16(T17_0C);
    T17_1C = _mm_abs_epi16(T17_1C);
    T17_0D = _mm_abs_epi16(T17_0D);
    T17_1D = _mm_abs_epi16(T17_1D);
    T17 = _mm_add_epi16(T17_0A, T17_1A);
    T17 = _mm_add_epi16(T17, T17_0B);
    T17 = _mm_add_epi16(T17, T17_1B);
    T17 = _mm_add_epi16(T17, T17_0C);
    T17 = _mm_add_epi16(T17, T17_1C);
    T17 = _mm_add_epi16(T17, T17_0D);
    T17 = _mm_add_epi16(T17, T17_1D);

    T18_0A = _mm_abs_epi16(T18_0A);
    T18_1A = _mm_abs_epi16(T18_1A);
    T18_0B = _mm_abs_epi16(T18_0B);
    T18_1B = _mm_abs_epi16(T18_1B);
    T18_0C = _mm_abs_epi16(T18_0C);
    T18_1C = _mm_abs_epi16(T18_1C);
    T18_0D = _mm_abs_epi16(T18_0D);
    T18_1D = _mm_abs_epi16(T18_1D);
    T18 = _mm_add_epi16(T18_0A, T18_1A);
    T18 = _mm_add_epi16(T18, T18_0B);
    T18 = _mm_add_epi16(T18, T18_1B);
    T18 = _mm_add_epi16(T18, T18_0C);
    T18 = _mm_add_epi16(T18, T18_1C);
    T18 = _mm_add_epi16(T18, T18_0D);
    T18 = _mm_add_epi16(T18, T18_1D);

    T19_0A = _mm_abs_epi16(T19_0A);
    T19_1A = _mm_abs_epi16(T19_1A);
    T19_0B = _mm_abs_epi16(T19_0B);
    T19_1B = _mm_abs_epi16(T19_1B);
    T19_0C = _mm_abs_epi16(T19_0C);
    T19_1C = _mm_abs_epi16(T19_1C);
    T19_0D = _mm_abs_epi16(T19_0D);
    T19_1D = _mm_abs_epi16(T19_1D);
    T19 = _mm_add_epi16(T19_0A, T19_1A);
    T19 = _mm_add_epi16(T19, T19_0B);
    T19 = _mm_add_epi16(T19, T19_1B);
    T19 = _mm_add_epi16(T19, T19_0C);
    T19 = _mm_add_epi16(T19, T19_1C);
    T19 = _mm_add_epi16(T19, T19_0D);
    T19 = _mm_add_epi16(T19, T19_1D);

    T20_0A = _mm_abs_epi16(T20_0A);
    T20_1A = _mm_abs_epi16(T20_1A);
    T20_0B = _mm_abs_epi16(T20_0B);
    T20_1B = _mm_abs_epi16(T20_1B);
    T20_0C = _mm_abs_epi16(T20_0C);
    T20_1C = _mm_abs_epi16(T20_1C);
    T20_0D = _mm_abs_epi16(T20_0D);
    T20_1D = _mm_abs_epi16(T20_1D);
    T20 = _mm_add_epi16(T20_0A, T20_1A);
    T20 = _mm_add_epi16(T20, T20_0B);
    T20 = _mm_add_epi16(T20, T20_1B);
    T20 = _mm_add_epi16(T20, T20_0C);
    T20 = _mm_add_epi16(T20, T20_1C);
    T20 = _mm_add_epi16(T20, T20_0D);
    T20 = _mm_add_epi16(T20, T20_1D);

    T21_0A = _mm_abs_epi16(T21_0A);
    T21_1A = _mm_abs_epi16(T21_1A);
    T21_0B = _mm_abs_epi16(T21_0B);
    T21_1B = _mm_abs_epi16(T21_1B);
    T21_0C = _mm_abs_epi16(T21_0C);
    T21_1C = _mm_abs_epi16(T21_1C);
    T21_0D = _mm_abs_epi16(T21_0D);
    T21_1D = _mm_abs_epi16(T21_1D);
    T21 = _mm_add_epi16(T21_0A, T21_1A);
    T21 = _mm_add_epi16(T21, T21_0B);
    T21 = _mm_add_epi16(T21, T21_1B);
    T21 = _mm_add_epi16(T21, T21_0C);
    T21 = _mm_add_epi16(T21, T21_1C);
    T21 = _mm_add_epi16(T21, T21_0D);
    T21 = _mm_add_epi16(T21, T21_1D);

    T22_0A = _mm_abs_epi16(T22_0A);
    T22_1A = _mm_abs_epi16(T22_1A);
    T22_0B = _mm_abs_epi16(T22_0B);
    T22_1B = _mm_abs_epi16(T22_1B);
    T22_0C = _mm_abs_epi16(T22_0C);
    T22_1C = _mm_abs_epi16(T22_1C);
    T22_0D = _mm_abs_epi16(T22_0D);
    T22_1D = _mm_abs_epi16(T22_1D);
    T22 = _mm_add_epi16(T22_0A, T22_1A);
    T22 = _mm_add_epi16(T22, T22_0B);
    T22 = _mm_add_epi16(T22, T22_1B);
    T22 = _mm_add_epi16(T22, T22_0C);
    T22 = _mm_add_epi16(T22, T22_1C);
    T22 = _mm_add_epi16(T22, T22_0D);
    T22 = _mm_add_epi16(T22, T22_1D);

    T23_0A = _mm_abs_epi16(T23_0A);
    T23_1A = _mm_abs_epi16(T23_1A);
    T23_0B = _mm_abs_epi16(T23_0B);
    T23_1B = _mm_abs_epi16(T23_1B);
    T23_0C = _mm_abs_epi16(T23_0C);
    T23_1C = _mm_abs_epi16(T23_1C);
    T23_0D = _mm_abs_epi16(T23_0D);
    T23_1D = _mm_abs_epi16(T23_1D);
    T23 = _mm_add_epi16(T23_0A, T23_1A);
    T23 = _mm_add_epi16(T23, T23_0B);
    T23 = _mm_add_epi16(T23, T23_1B);
    T23 = _mm_add_epi16(T23, T23_0C);
    T23 = _mm_add_epi16(T23, T23_1C);
    T23 = _mm_add_epi16(T23, T23_0D);
    T23 = _mm_add_epi16(T23, T23_1D);

    T24_0A = _mm_abs_epi16(T24_0A);
    T24_1A = _mm_abs_epi16(T24_1A);
    T24_0B = _mm_abs_epi16(T24_0B);
    T24_1B = _mm_abs_epi16(T24_1B);
    T24_0C = _mm_abs_epi16(T24_0C);
    T24_1C = _mm_abs_epi16(T24_1C);
    T24_0D = _mm_abs_epi16(T24_0D);
    T24_1D = _mm_abs_epi16(T24_1D);
    T24 = _mm_add_epi16(T24_0A, T24_1A);
    T24 = _mm_add_epi16(T24, T24_0B);
    T24 = _mm_add_epi16(T24, T24_1B);
    T24 = _mm_add_epi16(T24, T24_0C);
    T24 = _mm_add_epi16(T24, T24_1C);
    T24 = _mm_add_epi16(T24, T24_0D);
    T24 = _mm_add_epi16(T24, T24_1D);

    T25_0A = _mm_abs_epi16(T25_0A);
    T25_1A = _mm_abs_epi16(T25_1A);
    T25_0B = _mm_abs_epi16(T25_0B);
    T25_1B = _mm_abs_epi16(T25_1B);
    T25_0C = _mm_abs_epi16(T25_0C);
    T25_1C = _mm_abs_epi16(T25_1C);
    T25_0D = _mm_abs_epi16(T25_0D);
    T25_1D = _mm_abs_epi16(T25_1D);
    T25 = _mm_add_epi16(T25_0A, T25_1A);
    T25 = _mm_add_epi16(T25, T25_0B);
    T25 = _mm_add_epi16(T25, T25_1B);
    T25 = _mm_add_epi16(T25, T25_0C);
    T25 = _mm_add_epi16(T25, T25_1C);
    T25 = _mm_add_epi16(T25, T25_0D);
    T25 = _mm_add_epi16(T25, T25_1D);

    T26_0A = _mm_abs_epi16(T26_0A);
    T26_1A = _mm_abs_epi16(T26_1A);
    T26_0B = _mm_abs_epi16(T26_0B);
    T26_1B = _mm_abs_epi16(T26_1B);
    T26_0C = _mm_abs_epi16(T26_0C);
    T26_1C = _mm_abs_epi16(T26_1C);
    T26_0D = _mm_abs_epi16(T26_0D);
    T26_1D = _mm_abs_epi16(T26_1D);
    T26 = _mm_add_epi16(T26_0A, T26_1A);
    T26 = _mm_add_epi16(T26, T26_0B);
    T26 = _mm_add_epi16(T26, T26_1B);
    T26 = _mm_add_epi16(T26, T26_0C);
    T26 = _mm_add_epi16(T26, T26_1C);
    T26 = _mm_add_epi16(T26, T26_0D);
    T26 = _mm_add_epi16(T26, T26_1D);

    T27_0A = _mm_abs_epi16(T27_0A);
    T27_1A = _mm_abs_epi16(T27_1A);
    T27_0B = _mm_abs_epi16(T27_0B);
    T27_1B = _mm_abs_epi16(T27_1B);
    T27_0C = _mm_abs_epi16(T27_0C);
    T27_1C = _mm_abs_epi16(T27_1C);
    T27_0D = _mm_abs_epi16(T27_0D);
    T27_1D = _mm_abs_epi16(T27_1D);
    T27 = _mm_add_epi16(T27_0A, T27_1A);
    T27 = _mm_add_epi16(T27, T27_0B);
    T27 = _mm_add_epi16(T27, T27_1B);
    T27 = _mm_add_epi16(T27, T27_0C);
    T27 = _mm_add_epi16(T27, T27_1C);
    T27 = _mm_add_epi16(T27, T27_0D);
    T27 = _mm_add_epi16(T27, T27_1D);

    T28_0A = _mm_abs_epi16(T28_0A);
    T28_1A = _mm_abs_epi16(T28_1A);
    T28_0B = _mm_abs_epi16(T28_0B);
    T28_1B = _mm_abs_epi16(T28_1B);
    T28_0C = _mm_abs_epi16(T28_0C);
    T28_1C = _mm_abs_epi16(T28_1C);
    T28_0D = _mm_abs_epi16(T28_0D);
    T28_1D = _mm_abs_epi16(T28_1D);
    T28 = _mm_add_epi16(T28_0A, T28_1A);
    T28 = _mm_add_epi16(T28, T28_0B);
    T28 = _mm_add_epi16(T28, T28_1B);
    T28 = _mm_add_epi16(T28, T28_0C);
    T28 = _mm_add_epi16(T28, T28_1C);
    T28 = _mm_add_epi16(T28, T28_0D);
    T28 = _mm_add_epi16(T28, T28_1D);

    T29_0A = _mm_abs_epi16(T29_0A);
    T29_1A = _mm_abs_epi16(T29_1A);
    T29_0B = _mm_abs_epi16(T29_0B);
    T29_1B = _mm_abs_epi16(T29_1B);
    T29_0C = _mm_abs_epi16(T29_0C);
    T29_1C = _mm_abs_epi16(T29_1C);
    T29_0D = _mm_abs_epi16(T29_0D);
    T29_1D = _mm_abs_epi16(T29_1D);
    T29 = _mm_add_epi16(T29_0A, T29_1A);
    T29 = _mm_add_epi16(T29, T29_0B);
    T29 = _mm_add_epi16(T29, T29_1B);
    T29 = _mm_add_epi16(T29, T29_0C);
    T29 = _mm_add_epi16(T29, T29_1C);
    T29 = _mm_add_epi16(T29, T29_0D);
    T29 = _mm_add_epi16(T29, T29_1D);

    T30_0A = _mm_abs_epi16(T30_0A);
    T30_1A = _mm_abs_epi16(T30_1A);
    T30_0B = _mm_abs_epi16(T30_0B);
    T30_1B = _mm_abs_epi16(T30_1B);
    T30_0C = _mm_abs_epi16(T30_0C);
    T30_1C = _mm_abs_epi16(T30_1C);
    T30_0D = _mm_abs_epi16(T30_0D);
    T30_1D = _mm_abs_epi16(T30_1D);
    T30 = _mm_add_epi16(T30_0A, T30_1A);
    T30 = _mm_add_epi16(T30, T30_0B);
    T30 = _mm_add_epi16(T30, T30_1B);
    T30 = _mm_add_epi16(T30, T30_0C);
    T30 = _mm_add_epi16(T30, T30_1C);
    T30 = _mm_add_epi16(T30, T30_0D);
    T30 = _mm_add_epi16(T30, T30_1D);

    T31_0A = _mm_abs_epi16(T31_0A);
    T31_1A = _mm_abs_epi16(T31_1A);
    T31_0B = _mm_abs_epi16(T31_0B);
    T31_1B = _mm_abs_epi16(T31_1B);
    T31_0C = _mm_abs_epi16(T31_0C);
    T31_1C = _mm_abs_epi16(T31_1C);
    T31_0D = _mm_abs_epi16(T31_0D);
    T31_1D = _mm_abs_epi16(T31_1D);
    T31 = _mm_add_epi16(T31_0A, T31_1A);
    T31 = _mm_add_epi16(T31, T31_0B);
    T31 = _mm_add_epi16(T31, T31_1B);
    T31 = _mm_add_epi16(T31, T31_0C);
    T31 = _mm_add_epi16(T31, T31_1C);
    T31 = _mm_add_epi16(T31, T31_0D);
    T31 = _mm_add_epi16(T31, T31_1D);

    T32_0A = _mm_abs_epi16(T32_0A);
    T32_1A = _mm_abs_epi16(T32_1A);
    T32_0B = _mm_abs_epi16(T32_0B);
    T32_1B = _mm_abs_epi16(T32_1B);
    T32_0C = _mm_abs_epi16(T32_0C);
    T32_1C = _mm_abs_epi16(T32_1C);
    T32_0D = _mm_abs_epi16(T32_0D);
    T32_1D = _mm_abs_epi16(T32_1D);
    T32 = _mm_add_epi16(T32_0A, T32_1A);
    T32 = _mm_add_epi16(T32, T32_0B);
    T32 = _mm_add_epi16(T32, T32_1B);
    T32 = _mm_add_epi16(T32, T32_0C);
    T32 = _mm_add_epi16(T32, T32_1C);
    T32 = _mm_add_epi16(T32, T32_0D);
    T32 = _mm_add_epi16(T32, T32_1D);

    T33_0A = _mm_abs_epi16(T33_0A);
    T33_1A = _mm_abs_epi16(T33_1A);
    T33_0B = _mm_abs_epi16(T33_0B);
    T33_1B = _mm_abs_epi16(T33_1B);
    T33_0C = _mm_abs_epi16(T33_0C);
    T33_1C = _mm_abs_epi16(T33_1C);
    T33_0D = _mm_abs_epi16(T33_0D);
    T33_1D = _mm_abs_epi16(T33_1D);
    T33 = _mm_add_epi16(T33_0A, T33_1A);
    T33 = _mm_add_epi16(T33, T33_0B);
    T33 = _mm_add_epi16(T33, T33_1B);
    T33 = _mm_add_epi16(T33, T33_0C);
    T33 = _mm_add_epi16(T33, T33_1C);
    T33 = _mm_add_epi16(T33, T33_0D);
    T33 = _mm_add_epi16(T33, T33_1D);

    T34_0A = _mm_abs_epi16(T34_0A);
    T34_1A = _mm_abs_epi16(T34_1A);
    T34_0B = _mm_abs_epi16(T34_0B);
    T34_1B = _mm_abs_epi16(T34_1B);
    T34_0C = _mm_abs_epi16(T34_0C);
    T34_1C = _mm_abs_epi16(T34_1C);
    T34_0D = _mm_abs_epi16(T34_0D);
    T34_1D = _mm_abs_epi16(T34_1D);
    T34 = _mm_add_epi16(T34_0A, T34_1A);
    T34 = _mm_add_epi16(T34, T34_0B);
    T34 = _mm_add_epi16(T34, T34_1B);
    T34 = _mm_add_epi16(T34, T34_0C);
    T34 = _mm_add_epi16(T34, T34_1C);
    T34 = _mm_add_epi16(T34, T34_0D);
    T34 = _mm_add_epi16(T34, T34_1D);

    T35_0A = _mm_abs_epi16(T35_0A);
    T35_1A = _mm_abs_epi16(T35_1A);
    T35_0B = _mm_abs_epi16(T35_0B);
    T35_1B = _mm_abs_epi16(T35_1B);
    T35_0C = _mm_abs_epi16(T35_0C);
    T35_1C = _mm_abs_epi16(T35_1C);
    T35_0D = _mm_abs_epi16(T35_0D);
    T35_1D = _mm_abs_epi16(T35_1D);
    T35 = _mm_add_epi16(T35_0A, T35_1A);
    T35 = _mm_add_epi16(T35, T35_0B);
    T35 = _mm_add_epi16(T35, T35_1B);
    T35 = _mm_add_epi16(T35, T35_0C);
    T35 = _mm_add_epi16(T35, T35_1C);
    T35 = _mm_add_epi16(T35, T35_0D);
    T35 = _mm_add_epi16(T35, T35_1D);

    T36_0A = _mm_abs_epi16(T36_0A);
    T36_1A = _mm_abs_epi16(T36_1A);
    T36_0B = _mm_abs_epi16(T36_0B);
    T36_1B = _mm_abs_epi16(T36_1B);
    T36_0C = _mm_abs_epi16(T36_0C);
    T36_1C = _mm_abs_epi16(T36_1C);
    T36_0D = _mm_abs_epi16(T36_0D);
    T36_1D = _mm_abs_epi16(T36_1D);
    T36 = _mm_add_epi16(T36_0A, T36_1A);
    T36 = _mm_add_epi16(T36, T36_0B);
    T36 = _mm_add_epi16(T36, T36_1B);
    T36 = _mm_add_epi16(T36, T36_0C);
    T36 = _mm_add_epi16(T36, T36_1C);
    T36 = _mm_add_epi16(T36, T36_0D);
    T36 = _mm_add_epi16(T36, T36_1D);

    T37_0A = _mm_abs_epi16(T37_0A);
    T37_1A = _mm_abs_epi16(T37_1A);
    T37_0B = _mm_abs_epi16(T37_0B);
    T37_1B = _mm_abs_epi16(T37_1B);
    T37_0C = _mm_abs_epi16(T37_0C);
    T37_1C = _mm_abs_epi16(T37_1C);
    T37_0D = _mm_abs_epi16(T37_0D);
    T37_1D = _mm_abs_epi16(T37_1D);
    T37 = _mm_add_epi16(T37_0A, T37_1A);
    T37 = _mm_add_epi16(T37, T37_0B);
    T37 = _mm_add_epi16(T37, T37_1B);
    T37 = _mm_add_epi16(T37, T37_0C);
    T37 = _mm_add_epi16(T37, T37_1C);
    T37 = _mm_add_epi16(T37, T37_0D);
    T37 = _mm_add_epi16(T37, T37_1D);

    T38_0A = _mm_abs_epi16(T38_0A);
    T38_1A = _mm_abs_epi16(T38_1A);
    T38_0B = _mm_abs_epi16(T38_0B);
    T38_1B = _mm_abs_epi16(T38_1B);
    T38_0C = _mm_abs_epi16(T38_0C);
    T38_1C = _mm_abs_epi16(T38_1C);
    T38_0D = _mm_abs_epi16(T38_0D);
    T38_1D = _mm_abs_epi16(T38_1D);
    T38 = _mm_add_epi16(T38_0A, T38_1A);
    T38 = _mm_add_epi16(T38, T38_0B);
    T38 = _mm_add_epi16(T38, T38_1B);
    T38 = _mm_add_epi16(T38, T38_0C);
    T38 = _mm_add_epi16(T38, T38_1C);
    T38 = _mm_add_epi16(T38, T38_0D);
    T38 = _mm_add_epi16(T38, T38_1D);

    T39_0A = _mm_abs_epi16(T39_0A);
    T39_1A = _mm_abs_epi16(T39_1A);
    T39_0B = _mm_abs_epi16(T39_0B);
    T39_1B = _mm_abs_epi16(T39_1B);
    T39_0C = _mm_abs_epi16(T39_0C);
    T39_1C = _mm_abs_epi16(T39_1C);
    T39_0D = _mm_abs_epi16(T39_0D);
    T39_1D = _mm_abs_epi16(T39_1D);
    T39 = _mm_add_epi16(T39_0A, T39_1A);
    T39 = _mm_add_epi16(T39, T39_0B);
    T39 = _mm_add_epi16(T39, T39_1B);
    T39 = _mm_add_epi16(T39, T39_0C);
    T39 = _mm_add_epi16(T39, T39_1C);
    T39 = _mm_add_epi16(T39, T39_0D);
    T39 = _mm_add_epi16(T39, T39_1D);

    T40_0A = _mm_abs_epi16(T40_0A);
    T40_1A = _mm_abs_epi16(T40_1A);
    T40_0B = _mm_abs_epi16(T40_0B);
    T40_1B = _mm_abs_epi16(T40_1B);
    T40_0C = _mm_abs_epi16(T40_0C);
    T40_1C = _mm_abs_epi16(T40_1C);
    T40_0D = _mm_abs_epi16(T40_0D);
    T40_1D = _mm_abs_epi16(T40_1D);
    T40 = _mm_add_epi16(T40_0A, T40_1A);
    T40 = _mm_add_epi16(T40, T40_0B);
    T40 = _mm_add_epi16(T40, T40_1B);
    T40 = _mm_add_epi16(T40, T40_0C);
    T40 = _mm_add_epi16(T40, T40_1C);
    T40 = _mm_add_epi16(T40, T40_0D);
    T40 = _mm_add_epi16(T40, T40_1D);

    T41_0A = _mm_abs_epi16(T41_0A);
    T41_1A = _mm_abs_epi16(T41_1A);
    T41_0B = _mm_abs_epi16(T41_0B);
    T41_1B = _mm_abs_epi16(T41_1B);
    T41_0C = _mm_abs_epi16(T41_0C);
    T41_1C = _mm_abs_epi16(T41_1C);
    T41_0D = _mm_abs_epi16(T41_0D);
    T41_1D = _mm_abs_epi16(T41_1D);
    T41 = _mm_add_epi16(T41_0A, T41_1A);
    T41 = _mm_add_epi16(T41, T41_0B);
    T41 = _mm_add_epi16(T41, T41_1B);
    T41 = _mm_add_epi16(T41, T41_0C);
    T41 = _mm_add_epi16(T41, T41_1C);
    T41 = _mm_add_epi16(T41, T41_0D);
    T41 = _mm_add_epi16(T41, T41_1D);

    T42_0A = _mm_abs_epi16(T42_0A);
    T42_1A = _mm_abs_epi16(T42_1A);
    T42_0B = _mm_abs_epi16(T42_0B);
    T42_1B = _mm_abs_epi16(T42_1B);
    T42_0C = _mm_abs_epi16(T42_0C);
    T42_1C = _mm_abs_epi16(T42_1C);
    T42_0D = _mm_abs_epi16(T42_0D);
    T42_1D = _mm_abs_epi16(T42_1D);
    T42 = _mm_add_epi16(T42_0A, T42_1A);
    T42 = _mm_add_epi16(T42, T42_0B);
    T42 = _mm_add_epi16(T42, T42_1B);
    T42 = _mm_add_epi16(T42, T42_0C);
    T42 = _mm_add_epi16(T42, T42_1C);
    T42 = _mm_add_epi16(T42, T42_0D);
    T42 = _mm_add_epi16(T42, T42_1D);

    T43_0A = _mm_abs_epi16(T43_0A);
    T43_1A = _mm_abs_epi16(T43_1A);
    T43_0B = _mm_abs_epi16(T43_0B);
    T43_1B = _mm_abs_epi16(T43_1B);
    T43_0C = _mm_abs_epi16(T43_0C);
    T43_1C = _mm_abs_epi16(T43_1C);
    T43_0D = _mm_abs_epi16(T43_0D);
    T43_1D = _mm_abs_epi16(T43_1D);
    T43 = _mm_add_epi16(T43_0A, T43_1A);
    T43 = _mm_add_epi16(T43, T43_0B);
    T43 = _mm_add_epi16(T43, T43_1B);
    T43 = _mm_add_epi16(T43, T43_0C);
    T43 = _mm_add_epi16(T43, T43_1C);
    T43 = _mm_add_epi16(T43, T43_0D);
    T43 = _mm_add_epi16(T43, T43_1D);

    T44_0A = _mm_abs_epi16(T44_0A);
    T44_1A = _mm_abs_epi16(T44_1A);
    T44_0B = _mm_abs_epi16(T44_0B);
    T44_1B = _mm_abs_epi16(T44_1B);
    T44_0C = _mm_abs_epi16(T44_0C);
    T44_1C = _mm_abs_epi16(T44_1C);
    T44_0D = _mm_abs_epi16(T44_0D);
    T44_1D = _mm_abs_epi16(T44_1D);
    T44 = _mm_add_epi16(T44_0A, T44_1A);
    T44 = _mm_add_epi16(T44, T44_0B);
    T44 = _mm_add_epi16(T44, T44_1B);
    T44 = _mm_add_epi16(T44, T44_0C);
    T44 = _mm_add_epi16(T44, T44_1C);
    T44 = _mm_add_epi16(T44, T44_0D);
    T44 = _mm_add_epi16(T44, T44_1D);

    T45_0A = _mm_abs_epi16(T45_0A);
    T45_1A = _mm_abs_epi16(T45_1A);
    T45_0B = _mm_abs_epi16(T45_0B);
    T45_1B = _mm_abs_epi16(T45_1B);
    T45_0C = _mm_abs_epi16(T45_0C);
    T45_1C = _mm_abs_epi16(T45_1C);
    T45_0D = _mm_abs_epi16(T45_0D);
    T45_1D = _mm_abs_epi16(T45_1D);
    T45 = _mm_add_epi16(T45_0A, T45_1A);
    T45 = _mm_add_epi16(T45, T45_0B);
    T45 = _mm_add_epi16(T45, T45_1B);
    T45 = _mm_add_epi16(T45, T45_0C);
    T45 = _mm_add_epi16(T45, T45_1C);
    T45 = _mm_add_epi16(T45, T45_0D);
    T45 = _mm_add_epi16(T45, T45_1D);

    T46_0A = _mm_abs_epi16(T46_0A);
    T46_1A = _mm_abs_epi16(T46_1A);
    T46_0B = _mm_abs_epi16(T46_0B);
    T46_1B = _mm_abs_epi16(T46_1B);
    T46_0C = _mm_abs_epi16(T46_0C);
    T46_1C = _mm_abs_epi16(T46_1C);
    T46_0D = _mm_abs_epi16(T46_0D);
    T46_1D = _mm_abs_epi16(T46_1D);
    T46 = _mm_add_epi16(T46_0A, T46_1A);
    T46 = _mm_add_epi16(T46, T46_0B);
    T46 = _mm_add_epi16(T46, T46_1B);
    T46 = _mm_add_epi16(T46, T46_0C);
    T46 = _mm_add_epi16(T46, T46_1C);
    T46 = _mm_add_epi16(T46, T46_0D);
    T46 = _mm_add_epi16(T46, T46_1D);

    T47_0A = _mm_abs_epi16(T47_0A);
    T47_1A = _mm_abs_epi16(T47_1A);
    T47_0B = _mm_abs_epi16(T47_0B);
    T47_1B = _mm_abs_epi16(T47_1B);
    T47_0C = _mm_abs_epi16(T47_0C);
    T47_1C = _mm_abs_epi16(T47_1C);
    T47_0D = _mm_abs_epi16(T47_0D);
    T47_1D = _mm_abs_epi16(T47_1D);
    T47 = _mm_add_epi16(T47_0A, T47_1A);
    T47 = _mm_add_epi16(T47, T47_0B);
    T47 = _mm_add_epi16(T47, T47_1B);
    T47 = _mm_add_epi16(T47, T47_0C);
    T47 = _mm_add_epi16(T47, T47_1C);
    T47 = _mm_add_epi16(T47, T47_0D);
    T47 = _mm_add_epi16(T47, T47_1D);

    T48_0A = _mm_abs_epi16(T48_0A);
    T48_1A = _mm_abs_epi16(T48_1A);
    T48_0B = _mm_abs_epi16(T48_0B);
    T48_1B = _mm_abs_epi16(T48_1B);
    T48_0C = _mm_abs_epi16(T48_0C);
    T48_1C = _mm_abs_epi16(T48_1C);
    T48_0D = _mm_abs_epi16(T48_0D);
    T48_1D = _mm_abs_epi16(T48_1D);
    T48 = _mm_add_epi16(T48_0A, T48_1A);
    T48 = _mm_add_epi16(T48, T48_0B);
    T48 = _mm_add_epi16(T48, T48_1B);
    T48 = _mm_add_epi16(T48, T48_0C);
    T48 = _mm_add_epi16(T48, T48_1C);
    T48 = _mm_add_epi16(T48, T48_0D);
    T48 = _mm_add_epi16(T48, T48_1D);

    T49_0A = _mm_abs_epi16(T49_0A);
    T49_1A = _mm_abs_epi16(T49_1A);
    T49_0B = _mm_abs_epi16(T49_0B);
    T49_1B = _mm_abs_epi16(T49_1B);
    T49_0C = _mm_abs_epi16(T49_0C);
    T49_1C = _mm_abs_epi16(T49_1C);
    T49_0D = _mm_abs_epi16(T49_0D);
    T49_1D = _mm_abs_epi16(T49_1D);
    T49 = _mm_add_epi16(T49_0A, T49_1A);
    T49 = _mm_add_epi16(T49, T49_0B);
    T49 = _mm_add_epi16(T49, T49_1B);
    T49 = _mm_add_epi16(T49, T49_0C);
    T49 = _mm_add_epi16(T49, T49_1C);
    T49 = _mm_add_epi16(T49, T49_0D);
    T49 = _mm_add_epi16(T49, T49_1D);

    T50_0A = _mm_abs_epi16(T50_0A);
    T50_1A = _mm_abs_epi16(T50_1A);
    T50_0B = _mm_abs_epi16(T50_0B);
    T50_1B = _mm_abs_epi16(T50_1B);
    T50_0C = _mm_abs_epi16(T50_0C);
    T50_1C = _mm_abs_epi16(T50_1C);
    T50_0D = _mm_abs_epi16(T50_0D);
    T50_1D = _mm_abs_epi16(T50_1D);
    T50 = _mm_add_epi16(T50_0A, T50_1A);
    T50 = _mm_add_epi16(T50, T50_0B);
    T50 = _mm_add_epi16(T50, T50_1B);
    T50 = _mm_add_epi16(T50, T50_0C);
    T50 = _mm_add_epi16(T50, T50_1C);
    T50 = _mm_add_epi16(T50, T50_0D);
    T50 = _mm_add_epi16(T50, T50_1D);

    T51_0A = _mm_abs_epi16(T51_0A);
    T51_1A = _mm_abs_epi16(T51_1A);
    T51_0B = _mm_abs_epi16(T51_0B);
    T51_1B = _mm_abs_epi16(T51_1B);
    T51_0C = _mm_abs_epi16(T51_0C);
    T51_1C = _mm_abs_epi16(T51_1C);
    T51_0D = _mm_abs_epi16(T51_0D);
    T51_1D = _mm_abs_epi16(T51_1D);
    T51 = _mm_add_epi16(T51_0A, T51_1A);
    T51 = _mm_add_epi16(T51, T51_0B);
    T51 = _mm_add_epi16(T51, T51_1B);
    T51 = _mm_add_epi16(T51, T51_0C);
    T51 = _mm_add_epi16(T51, T51_1C);
    T51 = _mm_add_epi16(T51, T51_0D);
    T51 = _mm_add_epi16(T51, T51_1D);

    T52_0A = _mm_abs_epi16(T52_0A);
    T52_1A = _mm_abs_epi16(T52_1A);
    T52_0B = _mm_abs_epi16(T52_0B);
    T52_1B = _mm_abs_epi16(T52_1B);
    T52_0C = _mm_abs_epi16(T52_0C);
    T52_1C = _mm_abs_epi16(T52_1C);
    T52_0D = _mm_abs_epi16(T52_0D);
    T52_1D = _mm_abs_epi16(T52_1D);
    T52 = _mm_add_epi16(T52_0A, T52_1A);
    T52 = _mm_add_epi16(T52, T52_0B);
    T52 = _mm_add_epi16(T52, T52_1B);
    T52 = _mm_add_epi16(T52, T52_0C);
    T52 = _mm_add_epi16(T52, T52_1C);
    T52 = _mm_add_epi16(T52, T52_0D);
    T52 = _mm_add_epi16(T52, T52_1D);

    T53_0A = _mm_abs_epi16(T53_0A);
    T53_1A = _mm_abs_epi16(T53_1A);
    T53_0B = _mm_abs_epi16(T53_0B);
    T53_1B = _mm_abs_epi16(T53_1B);
    T53_0C = _mm_abs_epi16(T53_0C);
    T53_1C = _mm_abs_epi16(T53_1C);
    T53_0D = _mm_abs_epi16(T53_0D);
    T53_1D = _mm_abs_epi16(T53_1D);
    T53 = _mm_add_epi16(T53_0A, T53_1A);
    T53 = _mm_add_epi16(T53, T53_0B);
    T53 = _mm_add_epi16(T53, T53_1B);
    T53 = _mm_add_epi16(T53, T53_0C);
    T53 = _mm_add_epi16(T53, T53_1C);
    T53 = _mm_add_epi16(T53, T53_0D);
    T53 = _mm_add_epi16(T53, T53_1D);

    T54_0A = _mm_abs_epi16(T54_0A);
    T54_1A = _mm_abs_epi16(T54_1A);
    T54_0B = _mm_abs_epi16(T54_0B);
    T54_1B = _mm_abs_epi16(T54_1B);
    T54_0C = _mm_abs_epi16(T54_0C);
    T54_1C = _mm_abs_epi16(T54_1C);
    T54_0D = _mm_abs_epi16(T54_0D);
    T54_1D = _mm_abs_epi16(T54_1D);
    T54 = _mm_add_epi16(T54_0A, T54_1A);
    T54 = _mm_add_epi16(T54, T54_0B);
    T54 = _mm_add_epi16(T54, T54_1B);
    T54 = _mm_add_epi16(T54, T54_0C);
    T54 = _mm_add_epi16(T54, T54_1C);
    T54 = _mm_add_epi16(T54, T54_0D);
    T54 = _mm_add_epi16(T54, T54_1D);

    T55_0A = _mm_abs_epi16(T55_0A);
    T55_1A = _mm_abs_epi16(T55_1A);
    T55_0B = _mm_abs_epi16(T55_0B);
    T55_1B = _mm_abs_epi16(T55_1B);
    T55_0C = _mm_abs_epi16(T55_0C);
    T55_1C = _mm_abs_epi16(T55_1C);
    T55_0D = _mm_abs_epi16(T55_0D);
    T55_1D = _mm_abs_epi16(T55_1D);
    T55 = _mm_add_epi16(T55_0A, T55_1A);
    T55 = _mm_add_epi16(T55, T55_0B);
    T55 = _mm_add_epi16(T55, T55_1B);
    T55 = _mm_add_epi16(T55, T55_0C);
    T55 = _mm_add_epi16(T55, T55_1C);
    T55 = _mm_add_epi16(T55, T55_0D);
    T55 = _mm_add_epi16(T55, T55_1D);

    T56_0A = _mm_abs_epi16(T56_0A);
    T56_1A = _mm_abs_epi16(T56_1A);
    T56_0B = _mm_abs_epi16(T56_0B);
    T56_1B = _mm_abs_epi16(T56_1B);
    T56_0C = _mm_abs_epi16(T56_0C);
    T56_1C = _mm_abs_epi16(T56_1C);
    T56_0D = _mm_abs_epi16(T56_0D);
    T56_1D = _mm_abs_epi16(T56_1D);
    T56 = _mm_add_epi16(T56_0A, T56_1A);
    T56 = _mm_add_epi16(T56, T56_0B);
    T56 = _mm_add_epi16(T56, T56_1B);
    T56 = _mm_add_epi16(T56, T56_0C);
    T56 = _mm_add_epi16(T56, T56_1C);
    T56 = _mm_add_epi16(T56, T56_0D);
    T56 = _mm_add_epi16(T56, T56_1D);

    T57_0A = _mm_abs_epi16(T57_0A);
    T57_1A = _mm_abs_epi16(T57_1A);
    T57_0B = _mm_abs_epi16(T57_0B);
    T57_1B = _mm_abs_epi16(T57_1B);
    T57_0C = _mm_abs_epi16(T57_0C);
    T57_1C = _mm_abs_epi16(T57_1C);
    T57_0D = _mm_abs_epi16(T57_0D);
    T57_1D = _mm_abs_epi16(T57_1D);
    T57 = _mm_add_epi16(T57_0A, T57_1A);
    T57 = _mm_add_epi16(T57, T57_0B);
    T57 = _mm_add_epi16(T57, T57_1B);
    T57 = _mm_add_epi16(T57, T57_0C);
    T57 = _mm_add_epi16(T57, T57_1C);
    T57 = _mm_add_epi16(T57, T57_0D);
    T57 = _mm_add_epi16(T57, T57_1D);

    T58_0A = _mm_abs_epi16(T58_0A);
    T58_1A = _mm_abs_epi16(T58_1A);
    T58_0B = _mm_abs_epi16(T58_0B);
    T58_1B = _mm_abs_epi16(T58_1B);
    T58_0C = _mm_abs_epi16(T58_0C);
    T58_1C = _mm_abs_epi16(T58_1C);
    T58_0D = _mm_abs_epi16(T58_0D);
    T58_1D = _mm_abs_epi16(T58_1D);
    T58 = _mm_add_epi16(T58_0A, T58_1A);
    T58 = _mm_add_epi16(T58, T58_0B);
    T58 = _mm_add_epi16(T58, T58_1B);
    T58 = _mm_add_epi16(T58, T58_0C);
    T58 = _mm_add_epi16(T58, T58_1C);
    T58 = _mm_add_epi16(T58, T58_0D);
    T58 = _mm_add_epi16(T58, T58_1D);

    T59_0A = _mm_abs_epi16(T59_0A);
    T59_1A = _mm_abs_epi16(T59_1A);
    T59_0B = _mm_abs_epi16(T59_0B);
    T59_1B = _mm_abs_epi16(T59_1B);
    T59_0C = _mm_abs_epi16(T59_0C);
    T59_1C = _mm_abs_epi16(T59_1C);
    T59_0D = _mm_abs_epi16(T59_0D);
    T59_1D = _mm_abs_epi16(T59_1D);
    T59 = _mm_add_epi16(T59_0A, T59_1A);
    T59 = _mm_add_epi16(T59, T59_0B);
    T59 = _mm_add_epi16(T59, T59_1B);
    T59 = _mm_add_epi16(T59, T59_0C);
    T59 = _mm_add_epi16(T59, T59_1C);
    T59 = _mm_add_epi16(T59, T59_0D);
    T59 = _mm_add_epi16(T59, T59_1D);

    T60_0A = _mm_abs_epi16(T60_0A);
    T60_1A = _mm_abs_epi16(T60_1A);
    T60_0B = _mm_abs_epi16(T60_0B);
    T60_1B = _mm_abs_epi16(T60_1B);
    T60_0C = _mm_abs_epi16(T60_0C);
    T60_1C = _mm_abs_epi16(T60_1C);
    T60_0D = _mm_abs_epi16(T60_0D);
    T60_1D = _mm_abs_epi16(T60_1D);
    T60 = _mm_add_epi16(T60_0A, T60_1A);
    T60 = _mm_add_epi16(T60, T60_0B);
    T60 = _mm_add_epi16(T60, T60_1B);
    T60 = _mm_add_epi16(T60, T60_0C);
    T60 = _mm_add_epi16(T60, T60_1C);
    T60 = _mm_add_epi16(T60, T60_0D);
    T60 = _mm_add_epi16(T60, T60_1D);

    T61_0A = _mm_abs_epi16(T61_0A);
    T61_1A = _mm_abs_epi16(T61_1A);
    T61_0B = _mm_abs_epi16(T61_0B);
    T61_1B = _mm_abs_epi16(T61_1B);
    T61_0C = _mm_abs_epi16(T61_0C);
    T61_1C = _mm_abs_epi16(T61_1C);
    T61_0D = _mm_abs_epi16(T61_0D);
    T61_1D = _mm_abs_epi16(T61_1D);
    T61 = _mm_add_epi16(T61_0A, T61_1A);
    T61 = _mm_add_epi16(T61, T61_0B);
    T61 = _mm_add_epi16(T61, T61_1B);
    T61 = _mm_add_epi16(T61, T61_0C);
    T61 = _mm_add_epi16(T61, T61_1C);
    T61 = _mm_add_epi16(T61, T61_0D);
    T61 = _mm_add_epi16(T61, T61_1D);

    T62_0A = _mm_abs_epi16(T62_0A);
    T62_1A = _mm_abs_epi16(T62_1A);
    T62_0B = _mm_abs_epi16(T62_0B);
    T62_1B = _mm_abs_epi16(T62_1B);
    T62_0C = _mm_abs_epi16(T62_0C);
    T62_1C = _mm_abs_epi16(T62_1C);
    T62_0D = _mm_abs_epi16(T62_0D);
    T62_1D = _mm_abs_epi16(T62_1D);
    T62 = _mm_add_epi16(T62_0A, T62_1A);
    T62 = _mm_add_epi16(T62, T62_0B);
    T62 = _mm_add_epi16(T62, T62_1B);
    T62 = _mm_add_epi16(T62, T62_0C);
    T62 = _mm_add_epi16(T62, T62_1C);
    T62 = _mm_add_epi16(T62, T62_0D);
    T62 = _mm_add_epi16(T62, T62_1D);

    T63_0A = _mm_abs_epi16(T63_0A);
    T63_1A = _mm_abs_epi16(T63_1A);
    T63_0B = _mm_abs_epi16(T63_0B);
    T63_1B = _mm_abs_epi16(T63_1B);
    T63_0C = _mm_abs_epi16(T63_0C);
    T63_1C = _mm_abs_epi16(T63_1C);
    T63_0D = _mm_abs_epi16(T63_0D);
    T63_1D = _mm_abs_epi16(T63_1D);
    T63 = _mm_add_epi16(T63_0A, T63_1A);
    T63 = _mm_add_epi16(T63, T63_0B);
    T63 = _mm_add_epi16(T63, T63_1B);
    T63 = _mm_add_epi16(T63, T63_0C);
    T63 = _mm_add_epi16(T63, T63_1C);
    T63 = _mm_add_epi16(T63, T63_0D);
    T63 = _mm_add_epi16(T63, T63_1D);


    S = _mm_add_epi16(T0, T1);
    S = _mm_add_epi16(S, T2);
    S = _mm_add_epi16(S, T3);
    S = _mm_add_epi16(S, T4);
    S = _mm_add_epi16(S, T5);
    S = _mm_add_epi16(S, T6);
    S = _mm_add_epi16(S, T7);
    S = _mm_add_epi16(S, T8);
    S = _mm_add_epi16(S, T9);
    S = _mm_add_epi16(S, T10);
    S = _mm_add_epi16(S, T11);
    S = _mm_add_epi16(S, T12);
    S = _mm_add_epi16(S, T13);
    S = _mm_add_epi16(S, T14);
    S = _mm_add_epi16(S, T15);

    M1 = _mm_add_epi16(T16, T17);
    M1 = _mm_add_epi16(M1, T18);
    M1 = _mm_add_epi16(M1, T19);
    M1 = _mm_add_epi16(M1, T20);
    M1 = _mm_add_epi16(M1, T21);
    M1 = _mm_add_epi16(M1, T22);
    M1 = _mm_add_epi16(M1, T23);
    M1 = _mm_add_epi16(M1, T24);
    M1 = _mm_add_epi16(M1, T25);
    M1 = _mm_add_epi16(M1, T26);
    M1 = _mm_add_epi16(M1, T27);
    M1 = _mm_add_epi16(M1, T28);
    M1 = _mm_add_epi16(M1, T29);
    M1 = _mm_add_epi16(M1, T30);
    M1 = _mm_add_epi16(M1, T31);

    M2 = _mm_add_epi16(T32, T33);
    M2 = _mm_add_epi16(M2, T34);
    M2 = _mm_add_epi16(M2, T35);
    M2 = _mm_add_epi16(M2, T36);
    M2 = _mm_add_epi16(M2, T37);
    M2 = _mm_add_epi16(M2, T38);
    M2 = _mm_add_epi16(M2, T39);
    M2 = _mm_add_epi16(M2, T40);
    M2 = _mm_add_epi16(M2, T41);
    M2 = _mm_add_epi16(M2, T42);
    M2 = _mm_add_epi16(M2, T43);
    M2 = _mm_add_epi16(M2, T44);
    M2 = _mm_add_epi16(M2, T45);
    M2 = _mm_add_epi16(M2, T46);
    M2 = _mm_add_epi16(M2, T47);

    M = _mm_add_epi16(T48, T49);
    M = _mm_add_epi16(M, T50);
    M = _mm_add_epi16(M, T51);
    M = _mm_add_epi16(M, T52);
    M = _mm_add_epi16(M, T53);
    M = _mm_add_epi16(M, T54);
    M = _mm_add_epi16(M, T55);
    M = _mm_add_epi16(M, T56);
    M = _mm_add_epi16(M, T57);
    M = _mm_add_epi16(M, T58);
    M = _mm_add_epi16(M, T59);
    M = _mm_add_epi16(M, T60);
    M = _mm_add_epi16(M, T61);
    M = _mm_add_epi16(M, T62);
    M = _mm_add_epi16(M, T63);

    mads = M128_U16(S, 0) + M128_U16(S, 1) + M128_U16(S, 2) + M128_U16(S, 3) + M128_U16(S, 4) + M128_U16(S, 5) + M128_U16(S, 6) + M128_U16(S, 7);
    mad1 = M128_U16(M1, 0) + M128_U16(M1, 1) + M128_U16(M1, 2) + M128_U16(M1, 3) + M128_U16(M1, 4) + M128_U16(M1, 5) + M128_U16(M1, 6) + M128_U16(M1, 7);
    mad2 = M128_U16(M2, 0) + M128_U16(M2, 1) + M128_U16(M2, 2) + M128_U16(M2, 3) + M128_U16(M2, 4) + M128_U16(M2, 5) + M128_U16(M2, 6) + M128_U16(M2, 7);
    mad = M128_U16(M, 0) + M128_U16(M, 1) + M128_U16(M, 2) + M128_U16(M, 3) + M128_U16(M, 4) + M128_U16(M, 5) + M128_U16(M, 6) + M128_U16(M, 7);
    mad = mads + mad1 + mad2 + mad;

    return mad;
}



